diff --git a/.github/canary-scale-config.yml b/.github/canary-scale-config.yml index 6aed2790fa..cff1124c2d 100644 --- a/.github/canary-scale-config.yml +++ b/.github/canary-scale-config.yml @@ -77,11 +77,15 @@ runner_types: instance_type: c5.9xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} c.linux.12xlarge.ephemeral: disk_size: 200 instance_type: c5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} c.linux.16xlarge.nvidia.gpu: disk_size: 150 instance_type: g3.16xlarge @@ -103,6 +107,8 @@ runner_types: instance_type: c5.24xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} c.linux.2xlarge: disk_size: 150 instance_type: c5.2xlarge @@ -220,11 +226,15 @@ runner_types: instance_type: t4g.2xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} c.linux.arm64.m7g.4xlarge.ephemeral: disk_size: 256 instance_type: m7g.4xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} c.linux.arm64.m7g.metal: disk_size: 256 instance_type: m7g.metal @@ -238,6 +248,8 @@ runner_types: instance_type: g4dn.xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} c.windows.g4dn.xlarge.nonephemeral: disk_size: 256 instance_type: g4dn.xlarge @@ -251,6 +263,8 @@ runner_types: instance_type: c5d.4xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} c.windows.4xlarge.nonephemeral: disk_size: 256 instance_type: c5d.4xlarge @@ -264,6 +278,8 @@ runner_types: instance_type: p3.2xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} c.windows.8xlarge.nvidia.gpu.nonephemeral: disk_size: 256 instance_type: p3.2xlarge @@ -317,3 +333,5 @@ runner_types: instance_type: r5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} diff --git a/.github/lf-canary-scale-config.yml b/.github/lf-canary-scale-config.yml index 5527a05e74..82e6338ec1 100644 --- a/.github/lf-canary-scale-config.yml +++ b/.github/lf-canary-scale-config.yml @@ -77,11 +77,15 @@ runner_types: instance_type: c5.9xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.c.linux.12xlarge.ephemeral: disk_size: 200 instance_type: c5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.c.linux.16xlarge.nvidia.gpu: disk_size: 150 instance_type: g3.16xlarge @@ -103,6 +107,8 @@ runner_types: instance_type: c5.24xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.c.linux.2xlarge: disk_size: 150 instance_type: c5.2xlarge @@ -220,11 +226,15 @@ runner_types: instance_type: t4g.2xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.c.linux.arm64.m7g.4xlarge.ephemeral: disk_size: 256 instance_type: m7g.4xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.c.linux.arm64.m7g.metal: disk_size: 256 instance_type: m7g.metal @@ -238,6 +248,8 @@ runner_types: instance_type: g4dn.xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} lf.c.windows.g4dn.xlarge.nonephemeral: disk_size: 256 instance_type: g4dn.xlarge @@ -251,6 +263,8 @@ runner_types: instance_type: c5d.4xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} lf.c.windows.4xlarge.nonephemeral: disk_size: 256 instance_type: c5d.4xlarge @@ -264,6 +278,8 @@ runner_types: instance_type: p3.2xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} lf.c.windows.8xlarge.nvidia.gpu.nonephemeral: disk_size: 256 instance_type: p3.2xlarge @@ -317,3 +333,5 @@ runner_types: instance_type: r5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} diff --git a/.github/lf-scale-config.yml b/.github/lf-scale-config.yml index 
dd3ddcf840..12a1131388 100644 --- a/.github/lf-scale-config.yml +++ b/.github/lf-scale-config.yml @@ -77,11 +77,15 @@ runner_types: instance_type: c5.9xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.linux.12xlarge.ephemeral: disk_size: 200 instance_type: c5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.linux.16xlarge.nvidia.gpu: disk_size: 150 instance_type: g3.16xlarge @@ -103,6 +107,8 @@ runner_types: instance_type: c5.24xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.linux.2xlarge: disk_size: 150 instance_type: c5.2xlarge @@ -220,11 +226,15 @@ runner_types: instance_type: t4g.2xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.linux.arm64.m7g.4xlarge.ephemeral: disk_size: 256 instance_type: m7g.4xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} lf.linux.arm64.m7g.metal: disk_size: 256 instance_type: m7g.metal @@ -238,6 +248,8 @@ runner_types: instance_type: g4dn.xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} lf.windows.g4dn.xlarge.nonephemeral: disk_size: 256 instance_type: g4dn.xlarge @@ -251,6 +263,8 @@ runner_types: instance_type: c5d.4xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} lf.windows.4xlarge.nonephemeral: disk_size: 256 instance_type: c5d.4xlarge @@ -264,6 +278,8 @@ runner_types: instance_type: p3.2xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} lf.windows.8xlarge.nvidia.gpu.nonephemeral: disk_size: 256 instance_type: p3.2xlarge @@ -317,3 +333,5 @@ runner_types: instance_type: r5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} diff --git a/.github/scale-config.yml b/.github/scale-config.yml index 322ae2efe5..fbd92614d4 100644 --- a/.github/scale-config.yml +++ b/.github/scale-config.yml @@ -73,11 +73,15 @@ runner_types: instance_type: c5.9xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} linux.12xlarge.ephemeral: disk_size: 200 instance_type: c5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} linux.16xlarge.nvidia.gpu: disk_size: 150 instance_type: g3.16xlarge @@ -99,6 +103,8 @@ runner_types: instance_type: c5.24xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} linux.2xlarge: disk_size: 150 instance_type: c5.2xlarge @@ -216,11 +222,15 @@ runner_types: instance_type: t4g.2xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} linux.arm64.m7g.4xlarge.ephemeral: disk_size: 256 instance_type: m7g.4xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} linux.arm64.m7g.metal: disk_size: 256 instance_type: m7g.metal @@ -234,6 +244,8 @@ runner_types: instance_type: g4dn.xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} windows.g4dn.xlarge.nonephemeral: disk_size: 256 instance_type: g4dn.xlarge @@ -247,6 +259,8 @@ runner_types: instance_type: c5d.4xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} windows.4xlarge.nonephemeral: disk_size: 256 instance_type: c5d.4xlarge @@ -260,6 +274,8 @@ runner_types: instance_type: p3.2xlarge is_ephemeral: true os: windows + variants: + ephemeral: {} windows.8xlarge.nvidia.gpu.nonephemeral: disk_size: 256 instance_type: p3.2xlarge @@ -313,3 +329,5 @@ runner_types: instance_type: r5.12xlarge is_ephemeral: true os: linux + variants: + ephemeral: {} diff --git a/.github/scripts/validate_scale_config.py b/.github/scripts/validate_scale_config.py index 68acf2b93e..24f89a8b68 100644 --- a/.github/scripts/validate_scale_config.py +++ b/.github/scripts/validate_scale_config.py @@ -11,7 +11,7 @@ import 
os import urllib.request from pathlib import Path -from typing import Any, cast, Dict, List, NamedTuple +from typing import Any, cast, Dict, List, NamedTuple, Union import jsonschema # type: ignore[import-untyped] import yaml @@ -139,12 +139,15 @@ def runner_types_are_equivalent( return are_same -def is_config_valid_internally(runner_types: Dict[str, Dict[str, str]]) -> bool: +def is_config_valid_internally( + runner_types: Dict[str, Dict[str, Union[int, str, dict]]], +) -> bool: """ Ensure that for every linux runner type in the config: 1 - they match RunnerTypeScaleConfig https://github.com/pytorch/test-infra/blob/f3c58fea68ec149391570d15a4d0a03bc26fbe4f/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts#L50 2 - they have a max_available of at least 50, or is not enforced + 3 - an ephemeral variant is defined """ invalid_runners = set() @@ -158,6 +161,39 @@ def is_config_valid_internally(runner_types: Dict[str, Dict[str, str]]) -> bool: # so the next part of the code might break continue + # Unnecessary validations that could be a simple one-liner, but Code scanning / lintrunner + # is merciless and will complain about it + if "variants" not in runner_config: + print(f"Runner type {runner_type} does not have a variants section defined") + invalid_runners.add(runner_type) + continue + if not isinstance(runner_config["variants"], dict): + print( + f"Runner type {runner_type} has a variants section that is not a dictionary" + ) + invalid_runners.add(runner_type) + continue + + ephemeral_variant: Union[None, dict] = runner_config["variants"].get( + "ephemeral", None + ) + + if ephemeral_variant is None: + print( + f"Runner type {runner_type} does not have an ephemeral variant defined" + ) + invalid_runners.add(runner_type) + continue + else: + if not ephemeral_variant.get( + "is_ephemeral", False + ) and not runner_config.get("is_ephemeral", False): + print( + f"Runner type {runner_type} has an ephemeral variant that is not ephemeral" + ) + invalid_runners.add(runner_type) + continue + # Ensure that the max_available is at least MAX_AVAILABLE_MINIMUM # this is a requirement as scale-up always keeps at minimum some spare runners live, and less than MAX_AVAILABLE_MINIMUM # will very easily trigger alerts of not enough runners @@ -172,9 +208,17 @@ def is_config_valid_internally(runner_types: Dict[str, Dict[str, str]]) -> bool: "property or set it to a negative value."
) invalid_runners.add(runner_type) + # This validation is not strictly necessary, as it is already validated by the jsonschema, + # but it is here to make the code scanner happy + elif not isinstance(runner_config["max_available"], int): + print( + f"Runner type {runner_type} has max_available set to {runner_config['max_available']}, " + "which is not an integer" + ) + invalid_runners.add(runner_type) elif ( - int(runner_config["max_available"]) < MAX_AVAILABLE_MINIMUM - and int(runner_config["max_available"]) >= 0 + runner_config["max_available"] < MAX_AVAILABLE_MINIMUM + and runner_config["max_available"] >= 0 ): print( f"Runner type {runner_type} has max_available set to {runner_config['max_available']}, " diff --git a/.github/workflows/mobile_job.yml b/.github/workflows/mobile_job.yml index 8d70480854..821fcea187 100644 --- a/.github/workflows/mobile_job.yml +++ b/.github/workflows/mobile_job.yml @@ -34,6 +34,11 @@ on: description: The device pool associated with the project default: 'arn:aws:devicefarm:us-west-2::devicepool:082d10e5-d7d7-48a5-ba5c-b33d66efa1f5' type: string + new-output-format-flag: + description: experimental flag to enable the new artifact JSON format + required: false + default: false + type: boolean # Pulling test-infra itself for device farm runner script test-infra-repository: @@ -310,7 +315,9 @@ jobs: RUN_ID: ${{ github.run_id }} RUN_ATTEMPT: ${{ github.run_attempt }} JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + GIT_JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} WORKING_DIRECTORY: test-infra/tools/device-farm-runner + NEW_OUTPUT_FORMAT_FLAG: ${{ inputs.new-output-format-flag }} uses: nick-fields/retry@v3.0.0 with: shell: bash @@ -331,20 +338,11 @@ jobs: --name-prefix "${JOB_NAME}-${DEVICE_TYPE}" \ --workflow-id "${RUN_ID}" \ --workflow-attempt "${RUN_ATTEMPT}" \ - --output "ios-artifacts-${JOB_ID}.json" + --output "ios-artifacts-${JOB_ID}.json" \ + --git-job-name "${GIT_JOB_NAME}" \ + --new-json-output-format "${NEW_OUTPUT_FORMAT_FLAG}" popd - - name: Upload iOS artifacts to S3 - uses: seemethere/upload-artifact-s3@v5 - if: always() - with: - retention-days: 14 - s3-bucket: gha-artifacts - s3-prefix: | device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts - path: | - test-infra/tools/device-farm-runner/ios-artifacts-${{ steps.get-job-id.outputs.job-id }}.json - - name: Run Android tests on devices id: android-test if: ${{ inputs.device-type == 'android' }} @@ -361,7 +359,9 @@ jobs: RUN_ID: ${{ github.run_id }} RUN_ATTEMPT: ${{ github.run_attempt }} JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + GIT_JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} WORKING_DIRECTORY: test-infra/tools/device-farm-runner + NEW_OUTPUT_FORMAT_FLAG: ${{ inputs.new-output-format-flag }} uses: nick-fields/retry@v3.0.0 with: shell: bash @@ -382,10 +382,26 @@ jobs: --name-prefix "${JOB_NAME}-${DEVICE_TYPE}" \ --workflow-id "${RUN_ID}" \ --workflow-attempt "${RUN_ATTEMPT}" \ - --output "android-artifacts-${JOB_ID}.json" + --output "android-artifacts-${JOB_ID}.json" \ + --git-job-name "${GIT_JOB_NAME}" \ + --new-json-output-format "${NEW_OUTPUT_FORMAT_FLAG}" popd - - name: Upload Android artifacts to S3 + - name: Check artifacts if any job fails + if: failure() + working-directory: test-infra/tools/device-farm-runner + shell: bash + env: + DEVICE_TYPE: ${{ inputs.device-type }} + BENCHMARK_OUTPUT: ${{ inputs.device-type }}-artifacts-${{ steps.get-job-id.outputs.job-id }}.json + GIT_JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + run: | + if [[ !
-f "$BENCHMARK_OUTPUT" ]]; then + echo "missing artifact json file for ${DEVICE_TYPE} with name ${BENCHMARK_OUTPUT}, generating ... " + echo "{\"git_job_name\": \"$GIT_JOB_NAME\"}" >> "$BENCHMARK_OUTPUT" + fi + + - name: Upload artifacts to S3 uses: seemethere/upload-artifact-s3@v5 if: always() with: @@ -394,4 +410,4 @@ jobs: s3-prefix: | device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts path: | - test-infra/tools/device-farm-runner/android-artifacts-${{ steps.get-job-id.outputs.job-id }}.json + test-infra/tools/device-farm-runner/${{ inputs.device-type }}-artifacts-${{ steps.get-job-id.outputs.job-id }}.json diff --git a/.github/workflows/test_mobile_job.yml b/.github/workflows/test_mobile_job.yml index 1c14c422e1..a99e467154 100644 --- a/.github/workflows/test_mobile_job.yml +++ b/.github/workflows/test_mobile_job.yml @@ -18,7 +18,7 @@ jobs: device-type: ios # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS runner: ubuntu-latest - # There values are prepared beforehand for the test + # These values are prepared beforehand for the test project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:b531574a-fb82-40ae-b687-8f0b81341ae0 device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:b531574a-fb82-40ae-b687-8f0b81341ae0/da5d902d-45db-477b-ae0a-766e06ef3845 ios-ipa-archive: https://ossci-assets.s3.amazonaws.com/DeviceFarm.ipa @@ -34,10 +34,45 @@ jobs: device-type: android runner: ubuntu-latest timeout: 120 - # There values are prepared beforehand for the test + # These values are prepared beforehand for the test project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:b531574a-fb82-40ae-b687-8f0b81341ae0 device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:b531574a-fb82-40ae-b687-8f0b81341ae0/bd86eb80-74a6-4511-8183-09aa66e3ccc4 android-app-archive: https://ossci-assets.s3.amazonaws.com/app-debug.apk android-test-archive: https://ossci-assets.s3.amazonaws.com/app-debug-androidTest.apk test-spec: https://ossci-assets.s3.amazonaws.com/android-llm-device-farm-test-spec.yml extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip + + test-ios-job-with-new-output-flag: + permissions: + id-token: write + contents: read + uses: ./.github/workflows/mobile_job.yml + with: + device-type: ios + # For iOS testing, the runner just needs to call AWS Device Farm, so there is no need to run this on macOS + runner: ubuntu-latest + # These values are prepared beforehand for the test + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:b531574a-fb82-40ae-b687-8f0b81341ae0 + device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:b531574a-fb82-40ae-b687-8f0b81341ae0/da5d902d-45db-477b-ae0a-766e06ef3845 + ios-ipa-archive: https://ossci-assets.s3.amazonaws.com/DeviceFarm.ipa + ios-xctestrun-zip: https://ossci-assets.s3.amazonaws.com/MobileNetClassifierTest_MobileNetClassifierTest_iphoneos17.4-arm64.xctestrun.zip + test-spec: https://ossci-assets.s3.amazonaws.com/default-ios-device-farm-appium-test-spec.yml + new-output-format-flag: true + + test-android-llama2-job-with-new-output-flag: + permissions: + id-token: write + contents: read + uses: ./.github/workflows/mobile_job.yml + with: + device-type: android + runner: ubuntu-latest + timeout: 120 + # These values are prepared beforehand for the test + project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:b531574a-fb82-40ae-b687-8f0b81341ae0 + device-pool-arn: 
arn:aws:devicefarm:us-west-2:308535385114:devicepool:b531574a-fb82-40ae-b687-8f0b81341ae0/bd86eb80-74a6-4511-8183-09aa66e3ccc4 + android-app-archive: https://ossci-assets.s3.amazonaws.com/app-debug.apk + android-test-archive: https://ossci-assets.s3.amazonaws.com/app-debug-androidTest.apk + test-spec: https://ossci-assets.s3.amazonaws.com/android-llm-device-farm-test-spec.yml + extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip + new-output-format-flag: true diff --git a/clickhouse_db_schema/oss_ci_queue_time_histogram/schema.sql b/clickhouse_db_schema/oss_ci_queue_time_histogram/schema.sql new file mode 100644 index 0000000000..5532197b0e --- /dev/null +++ b/clickhouse_db_schema/oss_ci_queue_time_histogram/schema.sql @@ -0,0 +1,27 @@ +CREATE TABLE misc.oss_ci_queue_time_histogram( + `created_at` DateTime64(0, 'UTC'), + `time_stamp` DateTime64(0, 'UTC'), + `type` String, + `repo` String DEFAULT 'pytorch/pytorch', + `workflow_name` String, + `job_name` String, + `machine_type` String, + `histogram_version` String, + `histogram` Array(UInt64), + `max_queue_time` UInt64, + `total_count` UInt64, + `extra_info` Map(String,String) +) +ENGINE = SharedMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}') +PARTITION BY toYYYYMM(created_at) +ORDER BY ( + job_name, + workflow_name, + machine_type, + time_stamp, + repo, + type +) +TTL toDate(time_stamp) + toIntervalYear(5) +SETTINGS index_granularity = 8192 diff --git a/terraform-aws-github-runner/main.tf b/terraform-aws-github-runner/main.tf index 094389bb06..eb602fc670 100644 --- a/terraform-aws-github-runner/main.tf +++ b/terraform-aws-github-runner/main.tf @@ -104,6 +104,7 @@ module "runners" { environment = var.environment tags = local.tags + scale_config_org = var.scale_config_org scale_config_repo = var.scale_config_repo scale_config_repo_path = var.scale_config_repo_path @@ -112,6 +113,8 @@ module "runners" { encrypt = var.encrypt_secrets } + retry_scale_up_chron_hud_query_url = var.retry_scale_up_chron_hud_query_url + must_have_issues_labels = var.must_have_issues_labels cant_have_issues_labels = var.cant_have_issues_labels diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js b/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js index 5902525a76..58cf2e6388 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js @@ -11,5 +11,8 @@ module.exports = { lines: 93, statements: 94 } - } + }, + moduleNameMapper: { + axios: 'axios/dist/node/axios.cjs', // Allow axios to work in tests + }, }; diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json b/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json index 6e5217a9ef..27976ad2c7 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json @@ -42,6 +42,7 @@ "@types/uuid": "^9.0.1", "async-mutex": "^0.4.0", "aws-sdk": "^2.863.0", + "axios": "^1.7.7", "cron-parser": "^3.3.0", "generic-pool": "^3.9.0", "lru-cache": "^6.0.0", diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts index 74ae4ac897..301ae1f995 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts +++
b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.test.ts @@ -1,4 +1,4 @@ -import { scaleDown as scaleDownL, scaleUp as scaleUpL } from './lambda'; +import { scaleDown as scaleDownL, scaleUp as scaleUpL, scaleUpChron as scaleUpChronL } from './lambda'; import nock from 'nock'; import { Config } from './scale-runners/config'; @@ -6,6 +6,7 @@ import { Context, SQSEvent, ScheduledEvent } from 'aws-lambda'; import { mocked } from 'ts-jest/utils'; import { scaleDown } from './scale-runners/scale-down'; import { scaleUp, RetryableScalingError } from './scale-runners/scale-up'; +import { scaleUpChron } from './scale-runners/scale-up-chron'; import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs'; import * as MetricsModule from './scale-runners/metrics'; @@ -21,8 +22,10 @@ jest.mock('aws-sdk', () => ({ jest.mock('./scale-runners/scale-down'); jest.mock('./scale-runners/scale-up'); jest.mock('./scale-runners/sqs'); +jest.mock('./scale-runners/scale-up-chron'); -const metrics = new MetricsModule.ScaleUpMetrics(); +const mockScaleUpMetrics = new MetricsModule.ScaleUpMetrics(); +const mockScaleUpChronMetrics = new MetricsModule.ScaleUpChronMetrics(); beforeEach(() => { jest.resetModules(); @@ -34,7 +37,7 @@ beforeEach(() => { describe('scaleUp', () => { beforeEach(() => { jest.spyOn(global.Math, 'random').mockReturnValue(1.0); - jest.spyOn(MetricsModule, 'ScaleUpMetrics').mockReturnValue(metrics); + jest.spyOn(MetricsModule, 'ScaleUpMetrics').mockReturnValue(mockScaleUpMetrics); }); afterEach(() => { @@ -55,8 +58,8 @@ describe('scaleUp', () => { callback, ); expect(mockedScaleUp).toBeCalledTimes(2); - expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, metrics); - expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 2 }, metrics); + expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, mockScaleUpMetrics); + expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 2 }, mockScaleUpMetrics); expect(callback).toBeCalledTimes(1); expect(callback).toBeCalledWith(null); }); @@ -88,12 +91,12 @@ describe('scaleUp', () => { callback, ); expect(mockedScaleUp).toBeCalledTimes(1); - expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, metrics); + expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, mockScaleUpMetrics); expect(callback).toBeCalledTimes(1); expect(callback).toBeCalledWith('Failed handling SQS event'); expect(sqsDeleteMessageBatch).toBeCalledTimes(1); - expect(sqsDeleteMessageBatch).toBeCalledWith(metrics, evts); + expect(sqsDeleteMessageBatch).toBeCalledWith(mockScaleUpMetrics, evts); }); it('stochasticOvershoot when retryCount > 5', async () => { @@ -137,7 +140,7 @@ describe('scaleUp', () => { }, ]; expect(sqsSendMessages).toBeCalledTimes(1); - expect(sqsSendMessages).toBeCalledWith(metrics, expected, 'asdf'); + expect(sqsSendMessages).toBeCalledWith(mockScaleUpMetrics, expected, 'asdf'); expect(sqsDeleteMessageBatch).toBeCalledTimes(0); }); @@ -205,10 +208,10 @@ describe('scaleUp', () => { }, ]; expect(sqsSendMessages).toBeCalledTimes(1); - expect(sqsSendMessages).toBeCalledWith(metrics, expected, 'asdf'); + expect(sqsSendMessages).toBeCalledWith(mockScaleUpMetrics, expected, 'asdf'); expect(sqsDeleteMessageBatch).toBeCalledTimes(1); - expect(sqsDeleteMessageBatch).toBeCalledWith(metrics, records); + expect(sqsDeleteMessageBatch).toBeCalledWith(mockScaleUpMetrics, records); }); }); @@ -231,3 +234,28 @@ describe('scaleDown', () => { expect(callback).toBeCalledWith('Failed'); }); }); + +describe('scaleUpChron', () => { + 
beforeEach(() => { + jest.spyOn(MetricsModule, 'ScaleUpChronMetrics').mockReturnValue(mockScaleUpChronMetrics); + }); + + it('succeeds', async () => { + const mockedScaleUpChron = mocked(scaleUpChron).mockResolvedValue(undefined); + const callback = jest.fn(); + await scaleUpChronL({} as unknown as ScheduledEvent, {} as unknown as Context, callback); + expect(mockedScaleUpChron).toBeCalledTimes(1); + expect(mockedScaleUpChron).toBeCalledWith(mockScaleUpChronMetrics); + expect(callback).toBeCalledTimes(1); + expect(callback).toBeCalledWith(null); + }); + + it('fails', async () => { + const mockedScaleUpChron = mocked(scaleUpChron).mockRejectedValue(Error('error')); + const callback = jest.fn(); + await scaleUpChronL({} as unknown as ScheduledEvent, {} as unknown as Context, callback); + expect(mockedScaleUpChron).toBeCalledTimes(1); + expect(callback).toBeCalledTimes(1); + expect(callback).toBeCalledWith('Failed'); + }); +}); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts index acbb6e52b9..b786d37e98 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts @@ -2,9 +2,15 @@ import { ActionRequestMessage, RetryableScalingError, scaleUp as scaleUpR } from import { Context, SQSEvent, SQSRecord, ScheduledEvent } from 'aws-lambda'; import { Config } from './scale-runners/config'; -import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics'; +import { + ScaleUpMetrics, + ScaleUpChronMetrics, + sendMetricsAtTimeout, + sendMetricsTimeoutVars, +} from './scale-runners/metrics'; import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils'; import { scaleDown as scaleDownR } from './scale-runners/scale-down'; +import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron'; import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs'; async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) { @@ -155,3 +161,35 @@ export async function scaleDown(event: ScheduledEvent, context: Context, callbac return callback('Failed'); } } + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export async function scaleUpChron(event: ScheduledEvent, context: Context, callback: any) { + // we maintain open connections to redis, so the event pool is only cleaned when the SIGTERM is sent + context.callbackWaitsForEmptyEventLoop = false; + + const metrics = new ScaleUpChronMetrics(); + const sndMetricsTimout: sendMetricsTimeoutVars = { + metrics: metrics, + }; + sndMetricsTimout.setTimeout = setTimeout( + sendMetricsAtTimeout(sndMetricsTimout), + (Config.Instance.lambdaTimeout - 10) * 1000, + ); + + try { + await scaleUpChronR(metrics); + return callback(null); + } catch (e) { + console.error(e); + return callback('Failed'); + } finally { + try { + clearTimeout(sndMetricsTimout.setTimeout); + sndMetricsTimout.metrics = undefined; + sndMetricsTimout.setTimeout = undefined; + await metrics.sendMetrics(); + } catch (e) { + console.error(`Error sending metrics: ${e}`); + } + } +} diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts index 0755e86970..df41652707 ---
a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts @@ -36,8 +36,11 @@ export class Config { readonly retryScaleUpRecordQueueUrl: string | undefined; readonly runnerGroupName: string | undefined; readonly runnersExtraLabels: undefined | string; + readonly scaleConfigOrg: string; readonly scaleConfigRepo: string; readonly scaleConfigRepoPath: string; + readonly scaleUpMinQueueTimeMinutes: number; + readonly scaleUpChronRecordQueueUrl: string | undefined; readonly secretsManagerSecretsId: string | undefined; readonly sSMParamCleanupAgeDays: number; readonly sSMParamMaxCleanupAllowance: number; @@ -93,9 +96,14 @@ export class Config { this.retryScaleUpRecordDelayS = Number(process.env.RETRY_SCALE_UP_RECORD_DELAY_S || '0'); /* istanbul ignore next */ this.retryScaleUpRecordJitterPct = Number(process.env.RETRY_SCALE_UP_RECORD_JITTER_PCT || '0'); - this.retryScaleUpRecordQueueUrl = process.env.RETRY_SCALE_UP_RECORD_QUEUE_URL; + this.retryScaleUpRecordQueueUrl = process.env.RETRY_SCALE_UP_CHRON_RECORD_QUEUE_URL; + this.scaleUpChronRecordQueueUrl = process.env.SCALE_UP_CHRON_HUD_QUERY_URL; + this.scaleUpMinQueueTimeMinutes = process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES + ? Number(process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES) + : 30; this.runnerGroupName = process.env.RUNNER_GROUP_NAME; this.runnersExtraLabels = process.env.RUNNER_EXTRA_LABELS; + this.scaleConfigOrg = process.env.SCALE_CONFIG_ORG || ''; /* istanbul ignore next */ this.scaleConfigRepo = process.env.SCALE_CONFIG_REPO || ''; if (this.enableOrganizationRunners && !this.scaleConfigRepo) { diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.test.ts index dcf7285fd3..c686a0a4d2 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.test.ts @@ -695,6 +695,15 @@ runner_types: variants: ephemeral: is_ephemeral: true + c.linux.4xlarge: + instance_type: c5.2xlarge + os: linux + min_available: 1 + max_available: 1 + disk_size: 150 + is_ephemeral: false + variants: + dummyvariant: {} lf.c.linux.4xlarge: instance_type: c5.2xlarge os: linux @@ -702,6 +711,7 @@ runner_types: disk_size: 150 is_ephemeral: false variants: + dummyvariant: {} ephemeral: is_ephemeral: true`; @@ -766,6 +776,30 @@ runner_types: ami: 'ami-123', }, ], + [ + 'c.dummyvariant.linux.4xlarge', + { + runnerTypeName: 'c.dummyvariant.linux.4xlarge', + instance_type: 'c5.2xlarge', + os: 'linux', + min_available: 1, + max_available: 1, + disk_size: 150, + is_ephemeral: false, + }, + ], + [ + 'c.linux.4xlarge', + { + runnerTypeName: 'c.linux.4xlarge', + instance_type: 'c5.2xlarge', + os: 'linux', + min_available: 1, + max_available: 1, + disk_size: 150, + is_ephemeral: false, + }, + ], [ 'lf.linux.4xlarge', { @@ -801,6 +835,17 @@ runner_types: is_ephemeral: false, }, ], + [ + 'lf.c.dummyvariant.linux.4xlarge', + { + runnerTypeName: 'lf.c.dummyvariant.linux.4xlarge', + instance_type: 'c5.2xlarge', + os: 'linux', + min_available: 1, + disk_size: 150, + is_ephemeral: false, + }, + ], [ 'lf.c.ephemeral.linux.4xlarge', { diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.ts 
b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.ts index 627b34ad02..844aa8e997 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/gh-runners.ts @@ -377,26 +377,26 @@ export async function getRunnerTypes( return; } - if (runnerType.variants.size > 0) { - Array.from(runnerType.variants.keys()).forEach((variant) => { - const variantType = runnerType.variants?.get(variant); - /* istanbul ignore next */ - if (!variantType) { - return; - } - - let variantRunnTypeName: string; - if (key.startsWith('lf.c.')) { - variantRunnTypeName = `lf.c.${variant}.${key.slice(5)}`; - } else if (key.startsWith('lf.')) { - variantRunnTypeName = `lf.${variant}.${key.slice(3)}`; - } else { - variantRunnTypeName = `${variant}.${key}`; - } - - result.set(variantRunnTypeName, { ...runnerType, ...variantType, runnerTypeName: variantRunnTypeName }); - }); - } + Array.from(runnerType.variants.keys()).forEach((variant) => { + const variantType = runnerType.variants?.get(variant); + /* istanbul ignore next */ + if (!variantType) { + return; + } + + let variantRunnTypeName: string; + if (key.startsWith('lf.c.')) { + variantRunnTypeName = `lf.c.${variant}.${key.slice(5)}`; + } else if (key.startsWith('lf.')) { + variantRunnTypeName = `lf.${variant}.${key.slice(3)}`; + } else if (key.startsWith('c.')) { + variantRunnTypeName = `c.${variant}.${key.slice(2)}`; + } else { + variantRunnTypeName = `${variant}.${key}`; + } + + result.set(variantRunnTypeName, { ...runnerType, ...variantType, runnerTypeName: variantRunnTypeName }); + }); }); const filteredResult: Map<string, RunnerType> = new Map(
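The prefix handling above is easy to misread, so here is a standalone sketch of just the naming rule. `expandVariantName` is a hypothetical helper written for illustration only (the real logic lives inline in `getRunnerTypes`); the expected outputs match the test fixtures above, e.g. `c.dummyvariant.linux.4xlarge` and `lf.c.ephemeral.linux.4xlarge`:

```typescript
// Sketch only: the variant name is spliced in after any fleet prefix
// ("lf.", "c.", or "lf.c."), never in front of it.
function expandVariantName(key: string, variant: string): string {
  if (key.startsWith('lf.c.')) return `lf.c.${variant}.${key.slice(5)}`;
  if (key.startsWith('lf.')) return `lf.${variant}.${key.slice(3)}`;
  if (key.startsWith('c.')) return `c.${variant}.${key.slice(2)}`;
  return `${variant}.${key}`;
}

console.log(expandVariantName('linux.4xlarge', 'ephemeral'));      // ephemeral.linux.4xlarge
console.log(expandVariantName('c.linux.4xlarge', 'dummyvariant')); // c.dummyvariant.linux.4xlarge
console.log(expandVariantName('lf.linux.4xlarge', 'ephemeral'));   // lf.ephemeral.linux.4xlarge
console.log(expandVariantName('lf.c.linux.4xlarge', 'ephemeral')); // lf.c.ephemeral.linux.4xlarge
```

The new `c.` branch is exactly what this hunk adds: previously a canary runner type like `c.linux.4xlarge` fell through to the last branch and produced `ephemeral.c.linux.4xlarge` instead of `c.ephemeral.linux.4xlarge`. The removed `variants.size > 0` guard was redundant, since iterating an empty map is already a no-op.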
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts index da6ace295d..6903a3bb72 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts @@ -913,8 +913,8 @@ export class Metrics { } export class ScaleUpMetrics extends Metrics { - constructor() { - super('scaleUp'); + constructor(lambdaName: string | undefined = undefined) { + super(lambdaName || 'scaleUp'); } /* istanbul ignore next */ @@ -1630,6 +1630,64 @@ export class ScaleDownMetrics extends Metrics { } } +export class ScaleUpChronMetrics extends ScaleUpMetrics { + constructor() { + super('scaleUpChron'); + } + + queuedRunnerStats(org: string, runnerType: string, numQueuedJobs: number) { + const dimensions = new Map([ + ['Org', org], + ['RunnerType', runnerType], + ['numQueuedJobs', numQueuedJobs.toString()], + ]); + this.addEntry('gh.scaleupchron.queuedRunners', 3, dimensions); + } + + queuedRunnerFailure(error: string) { + const dimensions = new Map([['error', error]]); + this.countEntry('gh.scaleupchron.queuedRunners.failure', 1, dimensions); + } + + /* istanbul ignore next */ + getQueuedJobsEndpointSuccess(ms: number) { + this.countEntry(`gh.calls.total`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.success`, 1); + this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms); + } + + /* istanbul ignore next */ + getQueuedJobsEndpointFailure(ms: number) { + this.countEntry(`gh.calls.total`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.failure`, 1); + this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms); + } + + scaleUpInstanceSuccess() { + this.scaleUpSuccess(); + this.countEntry('run.scaleupchron.success'); + } + + scaleUpInstanceFailureNonRetryable(error: string) { + const dimensions = new Map([['error', error]]); + // should we add more information about this or do we not care since it'll be requeued? + this.countEntry('run.scaleupchron.failure.nonRetryable', 1, dimensions); + } + + scaleUpInstanceFailureRetryable(error: string) { + const dimensions = new Map([['error', error]]); + + // should we add more information about this or do we not care since it'll be requeued? + this.countEntry('run.scaleupchron.failure.retryable', 1, dimensions); + } + + scaleUpInstanceNoOp() { + this.countEntry('run.scaleupchron.noop'); + } +} + export interface sendMetricsTimeoutVars { metrics?: Metrics; setTimeout?: ReturnType<typeof setTimeout>; } diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts new file mode 100644 index 0000000000..6af697c838 --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts @@ -0,0 +1,223 @@ +import { Config } from './config'; +import { mocked } from 'ts-jest/utils'; +import { getRepo, expBackOff, shuffleArrayInPlace } from './utils'; +import { getRunnerTypes } from './gh-runners'; + +// import * as ScaleUpChronModule from './scale-up-chron'; +import { scaleUpChron, getQueuedJobs } from './scale-up-chron'; +import { scaleUp } from './scale-up'; + +import * as MetricsModule from './metrics'; +import { RunnerType } from './runners'; +import nock from 'nock'; + +jest.mock('./runners'); +jest.mock('./gh-runners'); +jest.mock('./gh-issues'); +jest.mock('./utils'); +jest.mock('axios'); +jest.mock('./scale-up'); + +const hudQueryValidResponse = [ + { + runner_label: 'test_runner_type1', + org: 'test_org1', + repo: 'test_repo1', + num_queued_jobs: 1, + min_queue_time_minutes: 31, + max_queue_time_minutes: 31, + }, + { + runner_label: 'test_runner_type2', + org: 'test_org2', + repo: 'test_repo2', + num_queued_jobs: 2, + min_queue_time_minutes: 32, + max_queue_time_minutes: 32, + }, +]; +const hudQueryInvalidRunnerLabelResponse = [ + { + runner_label: 'label1-nomatch', + org: 'test_org1', + repo: 'test_repo1', + num_queued_jobs: 1, + min_queue_time_minutes: 31, + max_queue_time_minutes: 31, + }, + { + runner_label: 'test_runner_type2', + org: 'test_org2', + repo: 'test_repo2', + num_queued_jobs: 2, + min_queue_time_minutes: 32, + max_queue_time_minutes: 32, + }, +]; +const hudQueryInvalidOrgResponse = [ + { + runner_label: 'label1', + org: 'test_org1-nomatch', + repo: 'test_repo1', + num_queued_jobs: 1, + min_queue_time_minutes: 31, + max_queue_time_minutes: 31, + }, + { + runner_label: 'test_runner_type2', + org: 'test_org2', + repo: 'test_repo2', + num_queued_jobs: 2, + min_queue_time_minutes: 32, + max_queue_time_minutes: 32, + }, +]; + +const runnerTypeValid = 'test_runner_type1'; +const runnerTypeInvalid = 'runner_type_invalid'; + +const baseCfg = { + scaleConfigOrg: 'test_org1', + scaleUpMinQueueTimeMinutes: 30, + scaleUpChronRecordQueueUrl: 'url', +} as unknown as Config; + +const metrics = new MetricsModule.ScaleUpChronMetrics(); +beforeEach(() => { + jest.resetModules(); + jest.clearAllMocks(); + jest.restoreAllMocks(); + +
nock.disableNetConnect(); +}); + +describe('scaleUpChron', () => { + it('invalid scaleUpChronRecordQueueUrl', async () => { + const scaleUpChron = jest.requireActual('./scale-up-chron').scaleUpChron; + + jest.clearAllMocks(); + jest.spyOn(Config, 'Instance', 'get').mockImplementation( + () => + ({ + ...baseCfg, + scaleUpChronRecordQueueUrl: null, + } as unknown as Config), + ); + + mocked(getRepo).mockReturnValue({ owner: 'owner', repo: 'repo' }); + mocked(getRunnerTypes).mockResolvedValue(new Map([[runnerTypeValid, { is_ephemeral: false } as RunnerType]])); + + await expect(scaleUpChron(metrics)).rejects.toThrow( + new Error('scaleUpChronRecordQueueUrl is not set. Cannot send queued scale up requests'), + ); + }); + + it('queued jobs do not match available runners', async () => { + const scaleUpInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); + + jest.clearAllMocks(); + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); + + mocked(getRepo).mockReturnValue({ owner: 'test_org1', repo: 'test_repo1' }); + mocked(getRunnerTypes).mockResolvedValue(new Map([[runnerTypeInvalid, { is_ephemeral: false } as RunnerType]])); + mocked(expBackOff).mockResolvedValue({ data: hudQueryInvalidRunnerLabelResponse }); + + await scaleUpChron(metrics); + expect(scaleUpInstanceNoOpSpy).toBeCalledTimes(1); + }); + + it('queued jobs do not match scale config org', async () => { + const scaleUpInstanceNoOp = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); + + jest.clearAllMocks(); + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); + + mocked(getRepo).mockReturnValue({ owner: 'test_org1', repo: 'test_repo1' }); + mocked(expBackOff).mockResolvedValue({ data: hudQueryInvalidOrgResponse }); + mocked(getRunnerTypes).mockResolvedValue(new Map([[runnerTypeInvalid, { is_ephemeral: false } as RunnerType]])); + + await scaleUpChron(metrics); + expect(scaleUpInstanceNoOp).toBeCalledTimes(1); + }); + + it('queued jobs match available runners and scale config org and scaled up completes', async () => { + const mockedScaleUp = mocked(scaleUp).mockResolvedValue(undefined); + const scaleUpInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); + + jest.clearAllMocks(); + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); + + mocked(shuffleArrayInPlace).mockReturnValue([hudQueryValidResponse]); + mocked(getRepo).mockReturnValue({ owner: 'test_org1', repo: 'test_repo1' }); + mocked(getRunnerTypes).mockResolvedValue( + new Map([[runnerTypeValid, { runnerTypeName: 'test_runner_type1' } as RunnerType]]), + ); + mocked(expBackOff).mockResolvedValue({ data: hudQueryValidResponse }); + + await scaleUpChron(metrics); + expect(scaleUpInstanceNoOpSpy).toBeCalledTimes(0); + expect(mockedScaleUp).toBeCalledTimes(1); + }); + + it('scaled up throws error', async () => { + const mockedScaleUp = mocked(scaleUp).mockRejectedValue(Error('error')); + const scaleUpInstanceFailureRetryableSpy = jest.spyOn(metrics, 'scaleUpInstanceFailureRetryable'); + + jest.clearAllMocks(); + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); + + mocked(shuffleArrayInPlace).mockReturnValue([hudQueryValidResponse]); + mocked(getRepo).mockReturnValue({ owner: 'test_org1', repo: 'test_repo1' }); + mocked(getRunnerTypes).mockResolvedValue( + new Map([[runnerTypeValid, { runnerTypeName: 'test_runner_type1' } as RunnerType]]), + ); + mocked(expBackOff).mockResolvedValue({ data: hudQueryValidResponse }); + + await scaleUpChron(metrics); + 
expect(scaleUpInstanceFailureRetryableSpy).toBeCalledTimes(1); + expect(mockedScaleUp).toBeCalledTimes(1); + }); +}); + +describe('getQueuedJobs', () => { + it('get queue data from url request with valid response', async () => { + mocked(expBackOff).mockResolvedValue({ data: hudQueryValidResponse }); + + expect(await getQueuedJobs(metrics, 'url')).toEqual([ + { + runner_label: 'test_runner_type1', + org: 'test_org1', + repo: 'test_repo1', + num_queued_jobs: 1, + min_queue_time_minutes: 31, + max_queue_time_minutes: 31, + }, + { + runner_label: 'test_runner_type2', + org: 'test_org2', + repo: 'test_repo2', + num_queued_jobs: 2, + min_queue_time_minutes: 32, + max_queue_time_minutes: 32, + }, + ]); + }); + + it('get queue data from url request with invalid response', async () => { + mocked(expBackOff).mockImplementation(() => { + throw new Error('Throwing a fake error!'); + }); + + expect(await getQueuedJobs(metrics, 'url')).toEqual([]); + }); + + it('get queue data from url request with empty response', async () => { + const errorResponse = ''; + const queuedRunnerFailureSpy = jest.spyOn(metrics, 'queuedRunnerFailure'); + + mocked(expBackOff).mockResolvedValue({ data: errorResponse }); + + expect(await getQueuedJobs(metrics, 'url')).toEqual([]); + expect(queuedRunnerFailureSpy).toBeCalledTimes(1); + }); +}); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts new file mode 100644 index 0000000000..a0a5d1bddf --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -0,0 +1,120 @@ +import axios, { AxiosResponse } from 'axios'; + +import { Config } from './config'; +import { getRepo, shuffleArrayInPlace, expBackOff } from './utils'; +import { ScaleUpChronMetrics } from './metrics'; +import { getRunnerTypes } from './gh-runners'; +import { ActionRequestMessage, scaleUp } from './scale-up'; + +export async function scaleUpChron(metrics: ScaleUpChronMetrics): Promise<void> { + // This function does the following: + // 1. Queries for queued runners via HUD + // 2. Polls scale-config to filter the list to runner types that are self-hosted by this fleet, + // whether ephemeral or nonephemeral + // 3. For each runner queued for longer than the minimum delay, try to scale it up + + const scaleConfigRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo); + + const validRunnerTypes = await getRunnerTypes(scaleConfigRepo, metrics, Config.Instance.scaleConfigRepoPath); + + const minAutoScaleupDelayMinutes = Config.Instance.scaleUpMinQueueTimeMinutes; + if (!Config.Instance.scaleUpChronRecordQueueUrl) { + metrics.scaleUpInstanceFailureNonRetryable( + 'scaleUpChronRecordQueueUrl is not set. Cannot send queued scale up requests', + ); + throw new Error('scaleUpChronRecordQueueUrl is not set.
Cannot send queued scale up requests'); + } + const scaleUpChronRecordQueueUrl = Config.Instance.scaleUpChronRecordQueueUrl; + // Only proactively scale up the jobs that have been queued for longer than normal + // Filter out the queued jobs that do not correspond to a valid runner type + const queuedJobs = (await getQueuedJobs(metrics, scaleUpChronRecordQueueUrl)) + .filter((runner) => { + return ( + runner.min_queue_time_minutes >= minAutoScaleupDelayMinutes && runner.org === Config.Instance.scaleConfigOrg + ); + }) + .filter((requested_runner) => { + return Array.from(validRunnerTypes.keys()).some((available_runner_label) => { + return available_runner_label === requested_runner.runner_label; + }); + }); + + if (queuedJobs.length === 0) { + metrics.scaleUpInstanceNoOp(); + return; + } + + // Send a message to the SQS queue to scale up the runners + const scaleUpRequests: Array<ActionRequestMessage> = queuedJobs.flatMap((runner) => { + return new Array(runner.num_queued_jobs).fill({ + id: Math.floor(Math.random() * 100000000000000), + eventType: 'workflow_job', + repositoryName: runner.repo, + repositoryOwner: runner.org, + runnerLabels: [runner.runner_label], + }); + }); + + for (const request of shuffleArrayInPlace(scaleUpRequests)) { + try { + await scaleUp('aws:sqs', request, metrics); + metrics.scaleUpInstanceSuccess(); + } catch (error) { + metrics.scaleUpInstanceFailureRetryable((error as Error).message); + } + } +} + +interface QueuedJobsForRunner { + runner_label: string; + org: string; + repo: string; + num_queued_jobs: number; + min_queue_time_minutes: number; + max_queue_time_minutes: number; +} + +export async function getQueuedJobs( + metrics: ScaleUpChronMetrics, + scaleUpChronRecordQueueUrl: string, +): Promise<QueuedJobsForRunner[]> { + // This function queries the HUD for queued runners + // and returns a list of them + + const url = scaleUpChronRecordQueueUrl; + + try { + const response = await expBackOff(() => { + return metrics.trackRequest(metrics.getQueuedJobsEndpointSuccess, metrics.getQueuedJobsEndpointFailure, () => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return axios.get<any>(url); + }); + }); + + // Map the response to the class + if (response && response.data) { + // check if data is valid as QueuedJobsForRunner[] + if (response.data && Array.isArray(response.data)) { + return response.data.filter( + (runner) => + runner.runner_label && + runner.org && + runner.repo && + typeof runner.num_queued_jobs == 'number' && + runner.num_queued_jobs > 0 && + typeof runner.min_queue_time_minutes == 'number' && + typeof runner.max_queue_time_minutes == 'number', + ); + } else { + /* istanbul ignore next */ + throw new Error(`Invalid data returned from axios get request with url: ${url} - Not an array`); + } + } else { + throw new Error(`No data returned from axios get request with url: ${url}`); + } + } catch (error) { + metrics.queuedRunnerFailure((error as Error).message); + console.error('Error fetching queued runners:', error); + return []; + } +}
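To make the selection logic in `scaleUpChron` concrete, here is a self-contained sketch of the two filters with made-up sample rows (the real rows come from the HUD endpoint and the thresholds from `Config`):

```typescript
// Sketch with fabricated data; mirrors the two .filter() calls above.
interface QueuedJobsForRunner {
  runner_label: string;
  org: string;
  repo: string;
  num_queued_jobs: number;
  min_queue_time_minutes: number;
  max_queue_time_minutes: number;
}

const validRunnerLabels = new Set(['linux.4xlarge']); // keys of the getRunnerTypes() result
const scaleConfigOrg = 'test_org1';                   // Config.Instance.scaleConfigOrg
const minQueueTimeMinutes = 30;                       // Config.Instance.scaleUpMinQueueTimeMinutes

const rows: QueuedJobsForRunner[] = [
  { runner_label: 'linux.4xlarge', org: 'test_org1', repo: 'test_repo1', num_queued_jobs: 2, min_queue_time_minutes: 45, max_queue_time_minutes: 60 },
  { runner_label: 'linux.4xlarge', org: 'other_org', repo: 'other_repo', num_queued_jobs: 1, min_queue_time_minutes: 50, max_queue_time_minutes: 50 }, // dropped: wrong org
  { runner_label: 'linux.4xlarge', org: 'test_org1', repo: 'test_repo1', num_queued_jobs: 3, min_queue_time_minutes: 10, max_queue_time_minutes: 12 }, // dropped: under threshold
];

// Keep rows that have queued long enough, belong to the configured org, and
// whose label maps to a runner type this fleet manages; each survivor then
// fans out into num_queued_jobs synthetic 'workflow_job' scale-up requests.
const toScale = rows
  .filter((r) => r.min_queue_time_minutes >= minQueueTimeMinutes && r.org === scaleConfigOrg)
  .filter((r) => validRunnerLabels.has(r.runner_label));

console.log(toScale.length); // 1 -> two scaleUp() calls for its 2 queued jobs
```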
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml new file mode 100644 index 0000000000..fda647cd9e --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml @@ -0,0 +1,13 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: 'AWS::Serverless-2016-10-31' +Resources: + ScaleUpChronFunction: + Type: 'AWS::Serverless::Function' + Properties: + Handler: index.scaleUpChron + Runtime: nodejs20.x + Events: + ScheduledEvent: + Type: Schedule + Properties: + Schedule: 'rate(1 minute)' \ No newline at end of file diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock b/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock index 7930c0018f..6ed2cc9b2c 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock @@ -1381,6 +1381,15 @@ aws-sdk@^2.863.0: uuid "8.0.0" xml2js "0.6.2" +axios@^1.7.7: + version "1.7.7" + resolved "https://registry.yarnpkg.com/axios/-/axios-1.7.7.tgz#2f554296f9892a72ac8d8e4c5b79c14a91d0a47f" + integrity sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q== + dependencies: + follow-redirects "^1.15.6" + form-data "^4.0.0" + proxy-from-env "^1.1.0" + babel-jest@^26.6.3: version "26.6.3" resolved "https://registry.yarnpkg.com/babel-jest/-/babel-jest-26.6.3.tgz#d87d25cb0037577a0c89f82e5755c5d293c01056" @@ -2326,6 +2335,11 @@ flatted@^3.2.9: resolved "https://registry.yarnpkg.com/flatted/-/flatted-3.3.1.tgz#21db470729a6734d4997002f439cb308987f567a" integrity sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw== +follow-redirects@^1.15.6: + version "1.15.9" + resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.15.9.tgz#a604fa10e443bf98ca94228d9eebcc2e8a2c8ee1" + integrity sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ== + for-each@^0.3.3: version "0.3.3" resolved "https://registry.yarnpkg.com/for-each/-/for-each-0.3.3.tgz#69b447e88a0a5d32c3e7084f3f1710034b21376e" @@ -2347,6 +2361,15 @@ form-data@^3.0.0: combined-stream "^1.0.8" mime-types "^2.1.12" +form-data@^4.0.0: + version "4.0.1" + resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.1.tgz#ba1076daaaa5bfd7e99c1a6cb02aa0a5cff90d48" + integrity sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw== + dependencies: + asynckit "^0.4.0" + combined-stream "^1.0.8" + mime-types "^2.1.12" + fragment-cache@^0.2.1: version "0.2.1" resolved "https://registry.yarnpkg.com/fragment-cache/-/fragment-cache-0.2.1.tgz#4290fad27f13e89be7f33799c6bc5a0abfff0d19" @@ -4028,6 +4051,11 @@ propagate@^2.0.0: resolved "https://registry.yarnpkg.com/propagate/-/propagate-2.0.1.tgz#40cdedab18085c792334e64f0ac17256d38f9a45" integrity sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag== +proxy-from-env@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2" + integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg== + psl@^1.1.33: version "1.9.0" resolved "https://registry.yarnpkg.com/psl/-/psl-1.9.0.tgz#d0df2a137f00794565fcaf3b2c00cd09f8d5a5a7" diff --git a/terraform-aws-github-runner/modules/runners/policies/lambda-scale-up-chron.json b/terraform-aws-github-runner/modules/runners/policies/lambda-scale-up-chron.json new file mode 100644 index 0000000000..b5d88076b9 --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/policies/lambda-scale-up-chron.json @@ -0,0 +1,43 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeInstances", + "ec2:DescribeTags", + "ec2:RunInstances", + "ec2:CreateNetworkInterface", + "ec2:DescribeNetworkInterfaces",
"ec2:DeleteNetworkInterface", + "ec2:DescribeImages", + "ec2:CreateTags", + "ec2:DeleteTags", + "ec2:CreateReplaceRootVolumeTask" + ], + "Resource": ["*"] + }, + { + "Effect": "Allow", + "Action": [ + "ec2:CreateTags" + ], + "Resource": ["*"], + "Condition": { + "StringEquals": { + "ec2:CreateAction" : "RunInstances" + } + } + }, + { + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "${arn_runner_instance_role}" + }, + { + "Effect": "Allow", + "Action": ["ssm:PutParameter"], + "Resource": "*" + } + ] + } diff --git a/terraform-aws-github-runner/modules/runners/scale-down.tf b/terraform-aws-github-runner/modules/runners/scale-down.tf index 9daa764b2e..6398555085 100644 --- a/terraform-aws-github-runner/modules/runners/scale-down.tf +++ b/terraform-aws-github-runner/modules/runners/scale-down.tf @@ -53,6 +53,7 @@ resource "aws_lambda_function" "scale_down" { MINIMUM_RUNNING_TIME_IN_MINUTES = var.minimum_running_time_in_minutes REDIS_ENDPOINT = var.redis_endpoint REDIS_LOGIN = var.redis_login + SCALE_CONFIG_ORG = var.scale_config_org SCALE_CONFIG_REPO = var.scale_config_repo SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path SCALE_DOWN_CONFIG = jsonencode(var.idle_config) diff --git a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf new file mode 100644 index 0000000000..52dfbf288c --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf @@ -0,0 +1,172 @@ +resource "aws_kms_grant" "scale_up_chron" { + count = var.encryption.encrypt ? (var.retry_scale_up_chron_hud_query_url != "" ? 1 : 0) : 0 + name = "${var.environment}-scale-up-chron" + key_id = var.encryption.kms_key_id + grantee_principal = aws_iam_role.scale_up_chron[0].arn + operations = ["Decrypt"] + + constraints { + encryption_context_equals = { + Environment = var.environment + } + } +} + +resource "aws_lambda_function" "scale_up_chron" { + count = var.retry_scale_up_chron_hud_query_url != "" ? 1 : 0 + s3_bucket = var.lambda_s3_bucket != null ? var.lambda_s3_bucket : null + s3_key = var.runners_lambda_s3_key != null ? var.runners_lambda_s3_key : null + s3_object_version = var.runners_lambda_s3_object_version != null ? var.runners_lambda_s3_object_version : null + filename = var.lambda_s3_bucket == null ? local.lambda_zip : null + source_code_hash = var.lambda_s3_bucket == null ? 
filebase64sha256(local.lambda_zip) : null + function_name = "${var.environment}-scale-up-chron" + role = aws_iam_role.scale_up_chron[0].arn + handler = "index.scaleUpChron" + runtime = "nodejs20.x" + timeout = var.lambda_timeout_scale_up_chron + tags = local.tags + memory_size = 2048 + + environment { + variables = { + AWS_REGION_INSTANCES = join(",", var.aws_region_instances) + DATETIME_DEPLOY = local.datetime_deploy + ENABLE_ORGANIZATION_RUNNERS = var.enable_organization_runners + ENVIRONMENT = var.environment + GHES_URL = var.ghes_url + GITHUB_APP_CLIENT_ID = var.github_app.client_id + GITHUB_APP_CLIENT_SECRET = var.github_app_client_secret + GITHUB_APP_ID = var.github_app.id + GITHUB_APP_KEY_BASE64 = var.github_app_key_base64 + KMS_KEY_ID = var.encryption.kms_key_id + LAMBDA_TIMEOUT = var.lambda_timeout_scale_up_chron + MIN_AVAILABLE_RUNNERS = var.min_available_runners + MINIMUM_RUNNING_TIME_IN_MINUTES = var.minimum_running_time_in_minutes + REDIS_ENDPOINT = var.redis_endpoint + REDIS_LOGIN = var.redis_login + SCALE_CONFIG_ORG = var.scale_config_org + SCALE_CONFIG_REPO = var.scale_config_repo + SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path + SCALE_UP_MIN_QUEUE_TIME_MINUTES = 30 + SCALE_UP_CHRON_HUD_QUERY_URL = var.retry_scale_up_chron_hud_query_url + scale_up_chron_CONFIG = jsonencode(var.idle_config) + SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id + AWS_REGIONS_TO_VPC_IDS = join( + ",", + sort(distinct([ + for region_vpc in var.vpc_ids : + format("%s|%s", region_vpc.region, region_vpc.vpc) + ])) + ) + VPC_ID_TO_SECURITY_GROUP_IDS = join( + ",", + sort(distinct(concat( + [ + for vpc in var.vpc_ids : + format( + "%s|%s", + vpc.vpc, + var.runners_security_group_ids[local.vpc_id_to_idx[vpc.vpc]] + ) + ], + [ + for vpc_subnet in var.vpc_sgs : + format("%s|%s", vpc_subnet.vpc, vpc_subnet.sg) + ] + ))) + ) + VPC_ID_TO_SUBNET_IDS = join( + ",", + sort(distinct([ + for vpc_subnet in var.subnet_vpc_ids : + format("%s|%s", vpc_subnet.vpc, vpc_subnet.subnet) + ])) + ) + SUBNET_ID_TO_AZ = join( + ",", + sort(distinct([ + for subnet_az in var.subnet_azs : + format("%s|%s", subnet_az.subnet, subnet_az.az) + ])) + ) + } + } + + vpc_config { + security_group_ids = concat( + var.lambda_security_group_ids, + [var.runners_security_group_ids[0]] + ) + subnet_ids = var.lambda_subnet_ids + } +} + +resource "aws_cloudwatch_log_group" "scale_up_chron" { + count = var.retry_scale_up_chron_hud_query_url != "" ? 1 : 0 + name = "/aws/lambda/${aws_lambda_function.scale_up_chron[0].function_name}" + retention_in_days = var.logging_retention_in_days + tags = var.tags +} + +resource "aws_cloudwatch_event_rule" "scale_up_chron" { + count = var.retry_scale_up_chron_hud_query_url != "" ? 1 : 0 + name = "${var.environment}-scale-up-chron-rule" + schedule_expression = var.scale_up_chron_schedule_expression + tags = var.tags +} + +resource "aws_cloudwatch_event_target" "scale_up_chron" { + count = var.retry_scale_up_chron_hud_query_url != "" ? 1 : 0 + rule = aws_cloudwatch_event_rule.scale_up_chron[0].name + arn = aws_lambda_function.scale_up_chron[0].arn +} + +resource "aws_lambda_permission" "scale_up_chron" { + count = var.retry_scale_up_chron_hud_query_url != "" ? 
diff --git a/terraform-aws-github-runner/modules/runners/scale-up.tf b/terraform-aws-github-runner/modules/runners/scale-up.tf index 8a47122534..6603bb0d12 100644 --- a/terraform-aws-github-runner/modules/runners/scale-up.tf +++ b/terraform-aws-github-runner/modules/runners/scale-up.tf @@ -65,8 +65,9 @@ resource "aws_lambda_function" "scale_up" { REDIS_LOGIN = var.redis_login RETRY_SCALE_UP_RECORD_DELAY_S = "60" RETRY_SCALE_UP_RECORD_JITTER_PCT = "0.5" - RETRY_SCALE_UP_RECORD_QUEUE_URL = var.sqs_build_queue_retry.url + RETRY_SCALE_UP_CHRON_RECORD_QUEUE_URL = var.sqs_build_queue_retry.url RUNNER_EXTRA_LABELS = var.runner_extra_labels + SCALE_CONFIG_ORG = var.scale_config_org SCALE_CONFIG_REPO = var.scale_config_repo SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id
diff --git a/terraform-aws-github-runner/modules/runners/variables.tf b/terraform-aws-github-runner/modules/runners/variables.tf index 1387af2a47..df7d411007 100644 --- a/terraform-aws-github-runner/modules/runners/variables.tf +++ b/terraform-aws-github-runner/modules/runners/variables.tf @@ -94,6 +94,12 @@ variable "scale_down_schedule_expression" { default = "cron(*/5 * * * ? *)" } +variable "scale_up_chron_schedule_expression" { + description = "Scheduler expression for when to run the scale-up-chron lambda." + type = string + default = "cron(*/30 * * * ? *)" # every 30 minutes +} + variable "minimum_running_time_in_minutes" { description = "The time an ec2 action runner should be running at minimum before terminated if non busy." type = number @@ -112,6 +118,12 @@ variable "lambda_timeout_scale_up" { default = 60 } +variable "lambda_timeout_scale_up_chron" { + description = "Timeout for the scale-up-chron lambda in seconds." + type = number + default = 900 +} + variable "role_permissions_boundary" { description = "Permissions boundary that will be added to the created role for the lambda." type = string @@ -285,6 +297,11 @@ variable "role_runner_arn" { type = string } +variable "scale_config_org" { + description = "Organization to fetch scale config from." + type = string +} + variable "scale_config_repo" { description = "Repository to fetch scale config from." default = "" @@ -301,3 +318,8 @@ variable "min_available_runners" { description = "Minimum number of runners to keep available." type = number } + +variable "retry_scale_up_chron_hud_query_url" { + description = "URL used in scale-up-chron to query HUD for queued jobs; if empty, scale-up-chron will not run." + type = string +}
diff --git a/terraform-aws-github-runner/variables.tf b/terraform-aws-github-runner/variables.tf index b99af920cf..9232a49644 100644 --- a/terraform-aws-github-runner/variables.tf +++ b/terraform-aws-github-runner/variables.tf @@ -345,6 +345,11 @@ variable "cant_have_issues_labels" { default = [] } +variable "scale_config_org" { + description = "Organization to fetch scale config from." + type = string +} + variable "scale_config_repo" { description = "Repository to fetch scale config from. Optional if `enable_organization_runners` is set to false, in which case the job's repo will be used" default = "" @@ -362,3 +367,9 @@ variable "min_available_runners" { type = number default = 10 } + +variable "retry_scale_up_chron_hud_query_url" { + description = "URL used in scale-up-chron to query HUD for queued jobs; if empty, scale-up-chron will not run." + type = string + default = "" +}
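The script changes below add three CLI flags (--git-job-name, --debug, --new-json-output-format) and switch from parse_args() to parse_known_args(), so a flag can later be removed without breaking callers that still pass it. A minimal sketch of that tolerant parsing, with hypothetical flag values:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--git-job-name", type=str, required=True)
# simulate a caller still passing a flag this parser no longer defines
args, unknown = parser.parse_known_args(
    ["--git-job-name", "mobile-job (ios)", "--new-json-output-format", "true"]
)
assert args.git_job_name == "mobile-job (ios)"
assert unknown == ["--new-json-output-format", "true"]  # logged by the script, not fatal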
diff --git a/tools/device-farm-runner/run_on_aws_devicefarm.py b/tools/device-farm-runner/run_on_aws_devicefarm.py index 37c947c632..7e5de7cce4 100755 --- a/tools/device-farm-runner/run_on_aws_devicefarm.py +++ b/tools/device-farm-runner/run_on_aws_devicefarm.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import copy import datetime import json import logging @@ -9,8 +10,10 @@ import sys import time from argparse import Action, ArgumentParser, Namespace +from dataclasses import asdict, dataclass from enum import Enum from logging import info +from math import inf from typing import Any, Dict, List, Optional from warnings import warn @@ -18,6 +21,7 @@ import requests +# TODO(elainewy): refactor and add unit tests for benchmark test logic POLLING_DELAY_IN_SECOND = 5 MAX_UPLOAD_WAIT_IN_SECOND = 600 @@ -35,6 +39,7 @@ class ReportType(Enum): JOB = "job" SUITE = "suite" TEST = "test" + UNKNOWN = "unknown" DEVICE_FARM_BUCKET = "gha-artifacts" @@ -190,13 +195,37 @@ def parse_args() -> Any: default=0, help="the workflow run attempt", ) + + parser.add_argument( + "--git-job-name", type=str, required=True, help="the name of the git job." + ) + parser.add_argument( "--output", type=str, help="an optional file to write the list of artifacts from AWS in JSON format", ) - return parser.parse_args() + parser.add_argument( + "--debug", + action="store_true", + help="debug mode; the artifacts won't be uploaded to s3, it should mainly be used in a local env", + ) + + parser.add_argument( + "--new-json-output-format", + type=str, + choices=["true", "false"], + default="false", + required=False, + help="enable new json artifact output format with mobile job reports and list of artifacts", + ) + + # use parse_known_args so that, when a flag is removed later, the mobile jobs do not fail due to an unrecognized flag. + args, unknown = parser.parse_known_args() + if len(unknown) > 0: + info(f"detected unknown flags: {unknown}") + return args
def upload_file( @@ -262,21 +291,6 @@ def download_artifact(artifact_url: str, local_filename: str) -> str: return local_filename -def upload_file_to_s3( - file_name: str, - bucket: str, - key: str, -) -> None: - """ - Upload a local file to S3 - """ - boto3.client("s3").upload_file( - file_name, - bucket, - key, - ) - - def set_output(val: Any, gh_var_name: str, filename: Optional[str]) -> None: if os.getenv("GITHUB_OUTPUT"): with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env: @@ -290,138 +304,6 @@ def set_output(val: Any, gh_var_name: str, filename: Optional[str]) -> None: print(val, file=f) -def print_testspec( - job_name: Optional[str], - file_name: str, - indent: int = 0, -) -> None: - """ - The test spec output from AWS Device Farm is the main output of the test job. - """ - print(f"::group::{job_name} test output") - with open(file_name) as f: - print(f.read()) - print("::endgroup::") - - -def print_test_artifacts( - client: Any, - app_type: str, - test_arn: str, - workflow_id: str, - workflow_attempt: int, - job_name: Optional[str], - indent: int = 0, -) -> List[Dict[str, str]]: - """ - Return all artifacts from this specific test.
There are 3 types of artifacts - from Device Farm including FILE, LOG, and SCREENSHOT - """ - gathered_artifacts = [] - - for artifact_type in ["FILE", "LOG", "SCREENSHOT"]: - r = client.list_artifacts(arn=test_arn, type=artifact_type) - for artifact in r.get("artifacts", []): - filetype = artifact["type"] - filename = artifact["name"].replace(" ", "_") - extension = artifact["extension"].replace(" ", "_") - - local_filename = ( - artifact["arn"].replace(":", "_").replace("/", "_") - + f"_{filename}.{extension}" - ) - s3_key = f"device_farm/{workflow_id}/{workflow_attempt}/{local_filename}" - # Download the artifact locally - upload_file_to_s3( - download_artifact(artifact["url"], local_filename), - DEVICE_FARM_BUCKET, - s3_key, - ) - - s3_url = f"https://{DEVICE_FARM_BUCKET}.s3.amazonaws.com/{s3_key}" - artifact["s3_url"] = s3_url - - info( - f"{' ' * indent}Saving {artifact_type} {filename}.{extension} ({filetype}) " - + f"at {s3_url}" - ) - - # Some more metadata to identify where the artifact comes from - artifact["app_type"] = app_type - artifact["job_name"] = job_name - gathered_artifacts.append(artifact) - - # Additional step to print the test output - if filetype == "TESTSPEC_OUTPUT": - print_testspec(job_name, local_filename, indent + 2) - - return gathered_artifacts - - -def print_report( - client: Any, - app_type: str, - report: Dict[str, Any], - report_type: ReportType, - job_name: Optional[str], - workflow_id: str, - workflow_attempt: int, - indent: int = 0, -) -> List[Dict[str, str]]: - """ - Print the test report from Device Farm in a friendly way and return the list - of any notable artifacts from the test run, i.e. logs and screenshots - """ - if not report: - warn("Missing report, returning...") - return [] - - name = report["name"] - # Keep the top-level job name as the name of the whole report, this - # is used to connect all artifacts from one job together - if report_type == ReportType.JOB: - job_name = name - result = report["result"] - - extra_msg = "" - if report_type == ReportType.SUITE or is_success(result): - counters = report["counters"] - extra_msg = f"with stats {counters}" - - info(f"{' ' * indent}{name} {result} {extra_msg}") - - arn = report["arn"] - if report_type == ReportType.RUN: - more_reports = client.list_jobs(arn=arn) - next_report_type = ReportType.JOB - elif report_type == ReportType.JOB: - more_reports = client.list_suites(arn=arn) - next_report_type = ReportType.SUITE - elif report_type == ReportType.SUITE: - more_reports = client.list_tests(arn=arn) - next_report_type = ReportType.TEST - elif report_type == ReportType.TEST: - return print_test_artifacts( - client, app_type, arn, workflow_id, workflow_attempt, job_name, indent + 2 - ) - - artifacts = [] - for more_report in more_reports.get(f"{next_report_type.value}s", []): - artifacts.extend( - print_report( - client, - app_type, - more_report, - next_report_type, - job_name, - workflow_id, - workflow_attempt, - indent + 2, - ) - ) - return artifacts - - def generate_ios_xctestrun( client: Any, project_arn: str, prefix: str, ios_xctestrun: str, test_spec: str ) -> Dict[str, str]: @@ -531,16 +413,340 @@ def generate_test_configuration( return {"extraDataPackageArn": extra_data_arn} +@dataclass +class DeviceFarmReport: + name: str + arn: str + report_type: str + status: str + result: str + counters: Dict[str, str] + app_type: str + infos: Dict[str, str] + parent_arn: str + + +@dataclass +class JobReport(DeviceFarmReport): + os: str + + +class ReportProcessor: + """ + A helper class to process 
the mobile test result from AWS Device Farm. + + Usage: + processor = ReportProcessor(...) + processor.start(mobile_run_report) + """ + + def __init__( + self, + device_farm_client: Any, + s3_client, + app_type: str, + workflow_id: str, + workflow_attempt: int, + is_debug: bool = False, + ): + self.aws_client = device_farm_client + self.s3_client = s3_client + self.app_type = app_type + self.workflow_id = workflow_id + self.workflow_attempt = workflow_attempt + self.run_report: Optional[DeviceFarmReport] = None + self.job_reports: list[JobReport] = [] + self.test_spec_info_list: list[Dict] = [] + self.is_debug = is_debug + + # todo(elainewy): add main method to pass run arn + def start(self, report: Dict[str, Any]) -> List[Dict[str, str]]: + if not report: + warn("Missing report, returning...") + return [] + + run_arn = report.get("arn", "") + if not run_arn: + warn("Missing arn from input report, returning...") + return [] + + if self.is_debug: + info( + "[DEBUG MODE] the artifacts won't be uploaded to s3; this should mainly be used in a local env" + ) + + self.run_report = self._to_run_report(report) + + # fetch the mobile job reports from the run + job_reports_resp = self.aws_client.list_jobs(arn=run_arn) + res = [] + + # fetch artifacts and sub-reports for each mobile job + for job_report in job_reports_resp.get(ReportType.JOB.value + "s", []): + metadata = self._to_job_report(job_report, run_arn) + self.job_reports.append(metadata) + artifacts = self._fetch_artifacts_and_reports( + job_report, + ReportType(metadata.report_type), + metadata, + ) + res.extend(artifacts) + return res + + def _fetch_artifacts_and_reports( + self, + report: Dict[str, Any], + report_type: ReportType, + job_metadata: JobReport, + indent: int = 0, + ) -> List[Dict[str, str]]: + """ + DFS method that traverses the DeviceFarm report tree from the mobile job level, + identifies and uploads significant artifacts (such as logs and screenshots) to AWS S3, + and returns a comprehensive list of artifact metadata, including relevant mobile job report information.
+ """ + if not report: + warn("Missing report, returning...") + return [] + + name = report["name"] + result = report["result"] + + extra_msg = "" + if report_type == ReportType.SUITE or is_success(result): + counters = report["counters"] + extra_msg = f"with stats {counters}" + + info(f"{' ' * indent}{name} {result} {extra_msg}") + + arn = report["arn"] + more_reports = {} + if report_type == ReportType.JOB: + more_reports = self.aws_client.list_suites(arn=arn) + next_report_type = ReportType.SUITE + elif report_type == ReportType.SUITE: + more_reports = self.aws_client.list_tests(arn=arn) + next_report_type = ReportType.TEST + elif report_type == ReportType.TEST: + return self._fetch_test_artifacts(arn, job_metadata, indent + 2) + else: + warn(f"Unknown report type {report_type}") + return [] + + artifacts = [] + for more_report in more_reports.get(f"{next_report_type.value}s", []): + artifacts.extend( + self._fetch_artifacts_and_reports( + more_report, + next_report_type, + job_metadata, + indent + 2, + ) + ) + return artifacts + + def _to_job_report( + self, report: Dict[str, Any], parent_arn: str, infos: Dict[str, str] = dict() + ) -> JobReport: + arn = report.get("arn", "") + status = report.get("status", "") + name = report.get("name", "") + result = report.get("result", "") + counters = report.get("counters", {}) + os = report.get("device", {}).get("os", "") + return JobReport( + arn=arn, + name=name, + app_type=self.app_type, + report_type=ReportType.JOB.value, + status=status, + result=result, + parent_arn=parent_arn, + counters=counters, + infos=infos, + os=os, + ) + + def _to_run_report( + self, report: Dict[str, Any], infos: Dict[str, str] = dict() + ) -> DeviceFarmReport: + arn = report.get("arn", "") + status = report.get("status", "") + name = report.get("name", "") + result = report.get("result", "") + counters = report.get("counters", {}) + + return DeviceFarmReport( + name=name, + arn=arn, + app_type=self.app_type, + report_type=ReportType.RUN.value, + status=status, + result=result, + counters=counters, + infos=infos, + parent_arn="", + ) + + def _fetch_test_artifacts( + self, test_arn: str, job_metadata: JobReport, indent: int = 0 + ) -> List[Dict[str, str]]: + """ + Return all artifacts from this specific test.
There are 3 types of artifacts + from Device Farm including FILE, LOG, and SCREENSHOT + """ + gathered_artifacts = [] + info(f"{' ' * indent} start gathering artifacts") + for artifact_type in ["FILE", "LOG", "SCREENSHOT"]: + r = self.aws_client.list_artifacts(arn=test_arn, type=artifact_type) + for artifact in r.get("artifacts", []): + filetype = artifact["type"] + filename = artifact["name"].replace(" ", "_") + extension = artifact["extension"].replace(" ", "_") + + local_filename = ( + artifact["arn"].replace(":", "_").replace("/", "_") + + f"_{filename}.{extension}" + ) + s3_key = f"device_farm/{self.workflow_id}/{self.workflow_attempt}/{local_filename}" + + # Download the artifact locally + artifact_file = download_artifact(artifact["url"], local_filename) + + if not self.is_debug: + # upload artifacts to s3 bucket + self._upload_file_to_s3(artifact_file, DEVICE_FARM_BUCKET, s3_key) + s3_url = f"https://{DEVICE_FARM_BUCKET}.s3.amazonaws.com/{s3_key}" + artifact["s3_url"] = s3_url + + info( + f"{' ' * indent} Saving {artifact_type} {filename}.{extension} ({filetype}) " + + f"at {s3_url}" + ) + + # Some more metadata to identify where the artifact comes from + artifact["app_type"] = self.app_type + artifact["job_name"] = job_metadata.name + artifact["os"] = job_metadata.os + artifact["job_arn"] = job_metadata.arn + artifact["job_conclusion"] = job_metadata.result + gathered_artifacts.append(artifact) + # Additional step to print the test output + if filetype == "TESTSPEC_OUTPUT": + self.test_spec_info_list.append( + { + "job_name": job_metadata.name, + "os": job_metadata.os, + "job_arn": job_metadata.arn, + "job_conclusion": job_metadata.result, + "local_filename": local_filename, + } + ) + return gathered_artifacts + + def print_test_spec(self) -> None: + info(f"Test Spec Outputs:") + for test_spec_info in self.test_spec_info_list: + self.print_single_testspec( + test_spec_info["job_name"], + test_spec_info["os"], + test_spec_info["job_conclusion"], + test_spec_info["local_filename"], + ) + + def print_single_testspec( + self, + job_name: str, + os: str, + job_conclusion: str, + file_name: str, + ) -> None: + """ + The test spec output from AWS Device Farm is the main output of the test job. 
+ """ + print(f"::group::{job_name} {os} test output [Job Result: {job_conclusion}]") + with open(file_name) as f: + print(f.read()) + print("::endgroup::") + + def get_run_report(self): + if not self.run_report: + warn( + "cannot get run report, run_report is empty, make sure you call start() first" + ) + return DeviceFarmReport( + name="", + arn="", + app_type=self.app_type, + report_type=ReportType.RUN.value, + status="", + result="", + counters={}, + infos={}, + parent_arn="", + ) + return copy.deepcopy(self.run_report) + + def get_job_reports(self): + return copy.deepcopy(self.job_reports) + + def print_run_report(self) -> None: + if not self.run_report: + warn( + "cannot print run report, run_report is empty, make sure you call start() first" + ) + return + d = asdict(self.run_report) + info(f"Run Report Output: {d}") + + def print_job_reports(self) -> None: + info("Job Report Output:") + for r in self.job_reports: + d = json.dumps(asdict(r)) + info(f"{d}") + + def _upload_file_to_s3(self, file_name: str, bucket: str, key: str) -> None: + """ + Upload a local file to S3 + """ + self.s3_client.upload_file( + file_name, + bucket, + key, + ) + + +def generate_artifacts_output( + artifacts: List[Dict[str, str]], + run_report: DeviceFarmReport, + job_reports: List[JobReport], + git_job_name: str, +): + output = { + "artifacts": artifacts, + "run_report": asdict(run_report), + "job_reports": [asdict(job_report) for job_report in job_reports], + "git_job_name": git_job_name, + } + return output + + def main() -> None: args = parse_args() + + # TODO: remove this once the flag is removed. + if args.new_json_output_format == "true": + info(f"use new json output format for {args.output}") + else: + info(f"use legacy json output format for {args.output}") + project_arn = args.project_arn name_prefix = args.name_prefix workflow_id = args.workflow_id workflow_attempt = args.workflow_attempt # NB: Device Farm is only available in us-west-2 region atm - client = boto3.client("devicefarm", region_name=AWS_REGION) + device_farm_client = boto3.client("devicefarm", region_name=AWS_REGION) + unique_prefix = ( f"{name_prefix}-{workflow_id}-{workflow_attempt}-" + f"{datetime.date.today().isoformat()}-{''.join(random.sample(string.ascii_letters, 8))}" @@ -555,7 +761,7 @@ def main() -> None: app_type = "ANDROID_APP" if args.app.endswith(".apk") else "IOS_APP" # Upload the test app appfile_arn = upload_file( - client=client, + client=device_farm_client, project_arn=project_arn, prefix=unique_prefix, filename=args.app, @@ -563,16 +769,22 @@ def main() -> None: ) info(f"Uploaded app: {appfile_arn}") + test_to_run = {} + if args.ios_xctestrun: app_type = "IOS_APP" test_to_run = generate_ios_xctestrun( - client, project_arn, unique_prefix, args.ios_xctestrun, args.test_spec + device_farm_client, + project_arn, + unique_prefix, + args.ios_xctestrun, + args.test_spec, ) if args.android_instrumentation_test: app_type = "ANDROID_APP" test_to_run = generate_android_instrumentation_test( - client, + device_farm_client, project_arn, unique_prefix, args.android_instrumentation_test, @@ -582,11 +794,11 @@ def main() -> None: configuration = {} if args.extra_data: configuration = generate_test_configuration( - client, project_arn, unique_prefix, args.extra_data + device_farm_client, project_arn, unique_prefix, args.extra_data ) # Schedule the test - r = client.schedule_run( + r = device_farm_client.schedule_run( projectArn=project_arn, name=unique_prefix, appArn=appfile_arn, @@ -603,31 +815,42 @@ def main() -> None: result = "" try: while True: - r = client.get_run(arn=run_arn) + r = device_farm_client.get_run(arn=run_arn) state = r["run"]["status"] - if state == "COMPLETED": result = r["run"]["result"] break - waiting_time = datetime.datetime.now() - start_time info(f"Run {unique_prefix} in state {state} after {waiting_time}") time.sleep(30) except Exception as error: warn(f"Failed to run {unique_prefix}: {error}") + # on failure, still emit a minimal payload in the new json output format + json_file = { + "git_job_name": args.git_job_name, + } + set_output(json.dumps(json_file), "artifacts", args.output) sys.exit(1) finally: - artifacts = print_report( - client, - app_type, - r.get("run"), - ReportType.RUN, - None, - workflow_id, - workflow_attempt, + info(f"Run {unique_prefix} finished with state {state} and result {result}") + s3_client = boto3.client("s3") + processor = ReportProcessor( + device_farm_client, s3_client, app_type, workflow_id, workflow_attempt ) - set_output(json.dumps(artifacts), "artifacts", args.output) - + artifacts = processor.start(r.get("run")) + + if args.new_json_output_format == "true": + output = generate_artifacts_output( + artifacts, + processor.get_run_report(), + processor.get_job_reports(), + git_job_name=args.git_job_name, + ) + set_output(json.dumps(output), "artifacts", args.output) + else: + info("Generating legacy json output") + set_output(json.dumps(artifacts), "artifacts", args.output) + processor.print_test_spec() if not is_success(result): sys.exit(1)
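Before the new unit tests, a sketch of how the refactored pieces above fit together end to end; the clients are real boto3 clients, while the run payload here is a hypothetical stand-in for get_run(arn=...)["run"]:

import json

import boto3

from run_on_aws_devicefarm import generate_artifacts_output, ReportProcessor

# hypothetical run report for illustration only
run = {"arn": "arn-run-1", "name": "demo", "status": "COMPLETED", "result": "PASSED", "counters": {}}

processor = ReportProcessor(
    boto3.client("devicefarm", region_name="us-west-2"),  # Device Farm is us-west-2 only
    boto3.client("s3"),
    "IOS_APP",
    "12345",  # workflow_id
    1,  # workflow_attempt
    is_debug=True,  # skip the S3 uploads while experimenting locally
)
artifacts = processor.start(run)
output = generate_artifacts_output(
    artifacts,
    processor.get_run_report(),
    processor.get_job_reports(),
    git_job_name="mobile-job (ios)",
)
print(json.dumps(output))  # {"artifacts": [...], "run_report": {...}, "job_reports": [...], "git_job_name": ...}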
"" try: while True: - r = client.get_run(arn=run_arn) + r = device_farm_client.get_run(arn=run_arn) state = r["run"]["status"] - if state == "COMPLETED": result = r["run"]["result"] break - waiting_time = datetime.datetime.now() - start_time info(f"Run {unique_prefix} in state {state} after {waiting_time}") time.sleep(30) except Exception as error: warn(f"Failed to run {unique_prefix}: {error}") + # just use the new json output format + json_file = { + "git_job_name": args.git_job_name, + } + set_output(json.dumps(json_file), "artifacts", args.output) sys.exit(1) finally: - artifacts = print_report( - client, - app_type, - r.get("run"), - ReportType.RUN, - None, - workflow_id, - workflow_attempt, + info(f"Run {unique_prefix} finished with state {state} and result {result}") + s3_client = boto3.client("s3") + processor = ReportProcessor( + device_farm_client, s3_client, app_type, workflow_id, workflow_attempt ) - set_output(json.dumps(artifacts), "artifacts", args.output) - + artifacts = processor.start(r.get("run")) + + if args.new_json_output_format == "true": + output = generate_artifacts_output( + artifacts, + processor.get_run_report(), + processor.get_job_reports(), + git_job_name=args.git_job_name, + ) + set_output(json.dumps(output), "artifacts", args.output) + else: + info("Generating legacy json output") + set_output(json.dumps(artifacts), "artifacts", args.output) + processor.print_test_spec() if not is_success(result): sys.exit(1) diff --git a/tools/device-farm-runner/test_run_on_aws_devicefarm.py b/tools/device-farm-runner/test_run_on_aws_devicefarm.py new file mode 100644 index 0000000000..20c1dbd565 --- /dev/null +++ b/tools/device-farm-runner/test_run_on_aws_devicefarm.py @@ -0,0 +1,314 @@ +import copy +import json +import unittest +from re import M +from typing import Any, Dict +from unittest import mock +from unittest.mock import MagicMock + +from run_on_aws_devicefarm import download_artifact, ReportProcessor + + +class MockS3Client: + def __init__(self): + self.mock_aws_client = MagicMock() + self.mock_aws_client.upload_file.return_value = None + + def getMockClient(self) -> MagicMock: + return self.mock_aws_client + + +class MockDeviceFarmClient: + def __init__(self): + self.mock_aws_client = MagicMock() + self.mock_aws_client.list_jobs.return_value = self.mockJobs() + self.mock_aws_client.list_suites.return_value = self.mockSuites() + self.mock_aws_client.list_tests.return_value = self.mockTests() + self.mock_aws_client.list_artifacts.side_effect = ( + lambda arn, type: self.getArtifacts(arn, type) + ) + + def getMockClient(self) -> MagicMock: + return self.mock_aws_client + + def getArtifacts(self, arn: str, type: str): + if type == "FILE": + return { + "artifacts": [ + { + "type": "TESTSPEC_OUTPUT", + "name": "test spec output", + "extension": "output", + "arn": "arn-artifact1", + "url": "url-artifact1", + }, + ] + } + elif type == "LOG": + return { + "artifacts": [ + { + "type": "LOG", + "name": "test log", + "extension": "output", + "arn": "arn-artifact2", + "url": "url-artifact2", + }, + ] + } + else: + return { + "artifacts": [ + { + "type": "VIDEO", + "name": "test video", + "extension": "output", + "arn": "arn-artifac3", + "url": "url-artifact3", + }, + ] + } + + def mockTests(self) -> Any: + return { + "tests": [ + { + "arn": "arn-test1", + "name": "Setup Test", + "status": "COMPLETED", + "result": "PASSED", + "counters": { + "total": 1, + "passed": 1, + "failed": 0, + "warned": 0, + "errored": 0, + "stopped": 0, + "skipped": 0, + }, + "message": "Successful 
test lifecycle of Setup Test", + } + ], + "ResponseMetadata": {}, + } + + def mockSuites(self): + return { + "suites": [ + { + "arn": "arn-suite1", + "name": "Setup Suite", + "status": "COMPLETED", + "result": "PASSED", + "counters": { + "total": 1, + "passed": 1, + "failed": 0, + "warned": 0, + "errored": 0, + "stopped": 0, + "skipped": 0, + }, + "message": "Successful", + }, + { + "arn": "arn-suite2", + "name": "Tests Suite", + "status": "COMPLETED", + "result": "FAILED", + "counters": { + "total": 1, + "passed": 1, + "failed": 0, + "warned": 0, + "errored": 0, + "stopped": 0, + "skipped": 0, + }, + "message": "Tests passed", + }, + ], + "ResponseMetadata": {}, + } + + def mockJobs(self): + return { + "jobs": [ + { + "arn": "arn-job-1", + "name": "Apple iPhone 15", + "status": "COMPLETED", + "result": "PASSED", + "counters": { + "total": 3, + "passed": 3, + "failed": 0, + "warned": 0, + "errored": 0, + "stopped": 0, + "skipped": 0, + }, + "message": "fake1", + "device": { + "arn": "device:00", + "name": "Apple iPhone 15", + "manufacturer": "Apple", + "model": "Apple iPhone 15", + "modelId": "A2846", + "formFactor": "PHONE", + "platform": "IOS", + "os": "18.0", + }, + }, + { + "arn": "arn-job-2", + "name": "Apple iPhone 17 Pro", + "status": "FAILED", + "result": "PASSED", + "counters": { + "total": 3, + "passed": 3, + "failed": 0, + "warned": 0, + "errored": 0, + "stopped": 0, + "skipped": 0, + }, + "message": "fake1", + "device": { + "arn": "device:00", + "name": "Apple iPhone 15", + "manufacturer": "Apple", + "model": "Apple iPhone 15", + "modelId": "A2846", + "formFactor": "PHONE", + "platform": "IOS", + "os": "11.0", + }, + }, + ] + } + + +class Test(unittest.TestCase): + @mock.patch("run_on_aws_devicefarm.download_artifact") + def test_reportProcessor_when_happyFlow_then_returnArtifacts( + self, download_artifact_mock + ): + # setup + m_df = MockDeviceFarmClient() + m_s3 = MockS3Client() + fakeReport = { + "name": "test", + "arn": "arn-run-report", + "status": "COMPLETED", + "result": "PASSED", + "counters": {"total": 3, "passed": 3, "failed": 0, "warned": 0}, + } + processor = ReportProcessor( + m_df.getMockClient(), m_s3.getMockClient(), "IOS", "wf1", 1 + ) + + # execute + artifacts = processor.start(fakeReport) + + # assert aws client calls + expectedNumArtifacts = 12 + m_df.getMockClient().list_jobs.assert_called_once() + self.assertGreaterEqual(m_df.getMockClient().list_suites.call_count, 2) + self.assertGreaterEqual( + m_s3.getMockClient().upload_file.call_count, expectedNumArtifacts + ) + + # assert artifacts + self.assertGreaterEqual(len(artifacts), expectedNumArtifacts) + + job1_artifacts = [ + artifact for artifact in artifacts if artifact.get("job_arn") == "arn-job-1" + ] + a1 = job1_artifacts[0] + self.assertEqual(a1["app_type"], "IOS") + self.assertEqual(a1["job_arn"], "arn-job-1") + self.assertEqual(a1["job_conclusion"], "PASSED") + self.assertEqual(a1["job_name"], "Apple iPhone 15") + self.assertEqual(a1["os"], "18.0") + self.assertEqual(a1["name"], "test spec output") + + job2_artifacts = [ + artifact for artifact in artifacts if artifact["job_arn"] == "arn-job-2" + ] + a2 = job2_artifacts[0] + self.assertEqual(a2["app_type"], "IOS") + self.assertEqual(a2["job_arn"], "arn-job-2") + self.assertEqual(a2["job_conclusion"], "PASSED") + self.assertEqual(a2["job_name"], "Apple iPhone 17 Pro") + self.assertEqual(a2["os"], "11.0") + self.assertEqual(a2["name"], "test spec output") + + @mock.patch("run_on_aws_devicefarm.download_artifact") + def 
test_reportProcessor_when_enableDebugMode_then_noArtifactsAreUploaded( + self, download_artifact_mock + ): + # setup + m_df = MockDeviceFarmClient() + m_s3 = MockS3Client() + fakeReport = { + "name": "test", + "arn": "arn-run-report", + "status": "COMPLETED", + "result": "PASSED", + "counters": {"total": 3, "passed": 3, "failed": 0, "warned": 0}, + } + processor = ReportProcessor( + m_df.getMockClient(), m_s3.getMockClient(), "IOS", "wf1", 1, True + ) + + # execute + artifacts = processor.start(fakeReport) + + # assert aws client calls + expectedNumArtifacts = 12 + m_df.getMockClient().list_jobs.assert_called_once() + self.assertGreaterEqual(download_artifact_mock.call_count, expectedNumArtifacts) + self.assertGreaterEqual(len(artifacts), expectedNumArtifacts) + + self.assertEqual(m_s3.getMockClient().upload_file.call_count, 0) + + def test_reportProcessor_when_hasEmptyReportInput_then_returnEmptyList(self): + # setup + m_df = MockDeviceFarmClient() + m_s3 = MockS3Client() + fakeReport = {} + processor = ReportProcessor( + m_df.getMockClient(), m_s3.getMockClient(), "IOS", "wf1", 1, True + ) + + # execute + artifacts = processor.start(fakeReport) + + # assert + self.assertEqual(m_df.getMockClient().list_jobs.call_count, 0) + self.assertEqual(len(artifacts), 0) + + def test_reportProcessor_when_missingArnInReportInput_then_returnEmptyList(self): + # setup + m_df = MockDeviceFarmClient() + m_s3 = MockS3Client() + fakeReport = { + "status": "COMPLETED", + "result": "PASSED", + } + processor = ReportProcessor( + m_df.getMockClient(), m_s3.getMockClient(), "IOS", "wf1", 1 + ) + + # execute + artifacts = processor.start(fakeReport) + + # assert + self.assertEqual(m_df.getMockClient().list_jobs.call_count, 0) + self.assertEqual(len(artifacts), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/scripts/generate_binary_build_matrix.py b/tools/scripts/generate_binary_build_matrix.py index d012e03f41..91aebdbc28 100755 --- a/tools/scripts/generate_binary_build_matrix.py +++ b/tools/scripts/generate_binary_build_matrix.py @@ -23,17 +23,17 @@ PYTHON_ARCHES_DICT = { "nightly": ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"], - "test": ["3.9", "3.10", "3.11", "3.12", "3.13"], + "test": ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"], "release": ["3.9", "3.10", "3.11", "3.12", "3.13"], } CUDA_ARCHES_DICT = { "nightly": ["11.8", "12.6", "12.8"], - "test": ["11.8", "12.4", "12.6"], + "test": ["11.8", "12.6", "12.8"], "release": ["11.8", "12.4", "12.6"], } ROCM_ARCHES_DICT = { "nightly": ["6.2.4", "6.3"], - "test": ["6.1", "6.2.4"], + "test": ["6.2.4", "6.3"], "release": ["6.1", "6.2.4"], } @@ -46,7 +46,7 @@ STABLE_CUDA_VERSIONS = { "nightly": "12.6", - "test": "12.4", + "test": "12.6", "release": "12.4", } @@ -76,7 +76,7 @@ CURRENT_NIGHTLY_VERSION = "2.7.0" -CURRENT_CANDIDATE_VERSION = "2.6.0" +CURRENT_CANDIDATE_VERSION = "2.7.0" CURRENT_STABLE_VERSION = "2.6.0" CURRENT_VERSION = CURRENT_STABLE_VERSION diff --git a/torchci/components/GroupJobConclusion.tsx b/torchci/components/GroupJobConclusion.tsx index 932f62a545..9ffa43004c 100644 --- a/torchci/components/GroupJobConclusion.tsx +++ b/torchci/components/GroupJobConclusion.tsx @@ -6,9 +6,13 @@ import { isUnstableJob, } from "lib/jobUtils"; import { IssueData, JobData } from "lib/types"; -import { PinnedTooltipContext } from "pages/hud/[repoOwner]/[repoName]/[branch]/[[...page]]"; +import { + MonsterFailuresContext, + PinnedTooltipContext, +} from "pages/hud/[repoOwner]/[repoName]/[branch]/[[...page]]"; import { useContext } from 
"react"; import hudStyles from "./hud.module.css"; +import { getFailureEl } from "./JobConclusion"; import styles from "./JobConclusion.module.css"; import { SingleWorkflowDispatcher } from "./WorkflowDispatcher"; @@ -65,6 +69,94 @@ function isJobViableStrictBlocking( return false; } +// React component to render either a group conclusion character or monsterized icons for failures +function GroupConclusionContent({ + conclusion, + isClassified, + erroredJobs, + toggleExpanded, + monsterFailures, +}: { + conclusion: GroupedJobStatus; + isClassified: boolean; + erroredJobs: JobData[]; + toggleExpanded: () => void; + monsterFailures: boolean; +}) { + // Only show monsters for failures and when monsterized failures is enabled + if (conclusion !== GroupedJobStatus.Failure || !monsterFailures) { + return ( + + {getGroupConclusionChar(conclusion)} + + ); + } + + // Get only unique monster icons based on their sprite index + const seenMonsterSprites = new Set(); + const allMonsters = []; + + for (const job of erroredJobs) { + if (job.failureLines && job.failureLines[0]) { + const monsterEl = getFailureEl(JobStatus.Failure, job); + if (monsterEl) { + // Get the sprite index from the data attribute + const spriteIdx = monsterEl.props["data-monster-hash"]; + + if (!seenMonsterSprites.has(spriteIdx)) { + seenMonsterSprites.add(spriteIdx); + allMonsters.push(monsterEl); + } + } + } + } + + if (allMonsters.length === 0) { + // Fallback to X if no monsters could be generated + return ( + + {getGroupConclusionChar(conclusion)} + + ); + } + + // Show the first monster icon with a count in bottom right + const firstMonster = allMonsters[0]; + + return ( + + {firstMonster} + {allMonsters.length > 1 && ( + {allMonsters.length} + )} + + ); +} + export default function HudGroupedCell({ sha, groupName, @@ -87,6 +179,7 @@ export default function HudGroupedCell({ repoName: string; }) { const [pinnedId, setPinnedId] = useContext(PinnedTooltipContext); + const [monsterFailures] = useContext(MonsterFailuresContext); const style = pinnedId.name == groupName ? hudStyles.highlight : ""; const erroredJobs = []; @@ -153,26 +246,38 @@ export default function HudGroupedCell({ /> } > - + {monsterFailures && conclusion === GroupedJobStatus.Failure ? ( + + + + + + ) : ( - {getGroupConclusionChar(conclusion)} + - + )} @@ -196,7 +301,64 @@ function GroupTooltip({ failedPreviousRunJobs: JobData[]; sha?: string; }) { + const [monsterFailures] = useContext(MonsterFailuresContext); + if (conclusion === GroupedJobStatus.Failure) { + // Show monster icons in the tooltip if monsterFailures is enabled + if (monsterFailures) { + // Group jobs by monster sprite index + const monsterGroups = new Map(); // Map of spriteIdx -> {monsterEl, jobs[]} + + for (const job of erroredJobs) { + if (job.failureLines && job.failureLines[0]) { + const monsterEl = getFailureEl(JobStatus.Failure, job); + if (monsterEl) { + // Get the sprite index from the data attribute + const spriteIdx = monsterEl.props["data-monster-hash"]; + + if (!monsterGroups.has(spriteIdx)) { + monsterGroups.set(spriteIdx, { monsterEl, jobs: [] }); + } + + // Add this job to the group with this monster + monsterGroups.get(spriteIdx).jobs.push(job); + } + } + } + + // Convert the map to an array for rendering + const monsterGroupsArray = Array.from(monsterGroups.values()); + + return ( +
<div>
+ <div>{`[${conclusion}] ${groupName}`}</div>
+ <div>The following jobs errored out:</div>
+ {monsterGroupsArray.map((group, groupIndex) => (
+ <div key={groupIndex}>
+ <div>
+ {group.monsterEl}
+ <span>
+ {group.jobs.length > 1
+ ? `${group.jobs.length} jobs with this error type:`
+ : "1 job with this error type:"}
+ </span>
+ </div>
+ {group.jobs.map((job: JobData, jobIndex: number) => (
+ <div key={jobIndex}>{job.name}</div>
+ ))}
+ </div>
+ ))}
+ </div>
+ );
+ }
+ return (
diff --git a/torchci/components/JobConclusion.tsx b/torchci/components/JobConclusion.tsx --- a/torchci/components/JobConclusion.tsx +++ b/torchci/components/JobConclusion.tsx
-const getFailureEl = (conclusion?: string, jobData?: JobData) => {
+export const getFailureEl = (conclusion?: string, jobData?: JobData) => {
 if ( conclusion !== JobStatus.Failure || !jobData?.failureLines || @@ -38,6 +38,7 @@ const getFailureEl = (conclusion?: string, jobData?: JobData) => {
); }; diff --git a/torchci/components/WorkflowBox.tsx b/torchci/components/WorkflowBox.tsx index 1c14369354..48f4685c06 100644 --- a/torchci/components/WorkflowBox.tsx +++ b/torchci/components/WorkflowBox.tsx @@ -384,6 +384,7 @@ function groupArtifacts(jobs: JobData[], artifacts: Artifact[]) { key = id; } } finally { + key = key.toString(); if (!grouping.has(key)) { grouping.set(key, []); } diff --git a/torchci/components/metrics/panels/TimeSeriesPanel.tsx b/torchci/components/metrics/panels/TimeSeriesPanel.tsx index 5a55ef867e..51685aa49a 100644 --- a/torchci/components/metrics/panels/TimeSeriesPanel.tsx +++ b/torchci/components/metrics/panels/TimeSeriesPanel.tsx @@ -39,7 +39,8 @@ export function seriesWithInterpolatedTimes( smooth: boolean = true, sort_by: "total" | "name" = "name", graph_type: ChartType = "line", - filter: string | undefined = undefined + filter: string | undefined = undefined, + isRegex: boolean = false ) { // We want to interpolate the data, filling any "holes" in our time series // with 0. @@ -118,9 +119,21 @@ export function seriesWithInterpolatedTimes( return serie; }); if (filter) { - series = series.filter((s) => - s.name.toLocaleLowerCase().includes(filter.toLocaleLowerCase()) - ); + if (isRegex) { + try { + const regex = new RegExp(filter, "i"); + series = series.filter((s) => regex.test(s.name)); + } catch (e) { + // If regex is invalid, fall back to simple include + series = series.filter((s) => + s.name.toLocaleLowerCase().includes(filter.toLocaleLowerCase()) + ); + } + } else { + series = series.filter((s) => + s.name.toLocaleLowerCase().includes(filter.toLocaleLowerCase()) + ); + } } if (sort_by === "name") { return _.sortBy(series, (x) => x.name); @@ -288,6 +301,7 @@ export default function TimeSeriesPanel({ sort_by = "name", max_items_in_series = 0, filter = undefined, + isRegex = false, auto_refresh = true, // Additional function to process the data after querying dataReader = undefined, @@ -308,6 +322,7 @@ export default function TimeSeriesPanel({ sort_by?: "total" | "name"; max_items_in_series?: number; filter?: string; + isRegex?: boolean; auto_refresh?: boolean; dataReader?: (_data: { [k: string]: any }[]) => { [k: string]: any }[]; }) { @@ -350,7 +365,8 @@ export default function TimeSeriesPanel({ smooth, sort_by, chartType, - filter + filter, + isRegex ); // If we have too many series, we'll only show the top N series by total value diff --git a/torchci/pages/cost_analysis.tsx b/torchci/pages/cost_analysis.tsx index 649793104f..8782272295 100644 --- a/torchci/pages/cost_analysis.tsx +++ b/torchci/pages/cost_analysis.tsx @@ -24,8 +24,9 @@ import TimeSeriesPanel, { import MultiSelectPicker from "components/MultiSelectPicker"; import dayjs from "dayjs"; import { fetcher } from "lib/GeneralUtils"; +import _ from "lodash"; import { useRouter } from "next/router"; -import React, { useEffect, useState } from "react"; +import React, { useCallback, useEffect, useState } from "react"; import { BiLineChart } from "react-icons/bi"; import { FaFilter, FaInfoCircle, FaRegChartBar } from "react-icons/fa"; import { MdOutlineStackedBarChart } from "react-icons/md"; @@ -227,6 +228,7 @@ export default function Page() { : PROVIDER_OPTIONS; const initialSelectedYAxis = (query.yAxis as YAxis) || "cost"; const initialSearchFilter = query.searchFilter || ""; + const initialIsRegex = query.isRegex === "true"; const initialSelectedRepos = query.repos ? 
splitString(query.repos) : undefined; @@ -260,6 +262,7 @@ const [searchFilter, setSearchFilter] = useState( initialSearchFilter as string ); + const [isRegex, setIsRegex] = useState(initialIsRegex); const [routerReady, setRouterReady] = useState(false); @@ -277,6 +280,7 @@ setSelectedProviders(initialSelectedProviders); setSelectedYAxis(initialSelectedYAxis || "cost"); setSearchFilter(initialSearchFilter as string); + setIsRegex(initialIsRegex); } const timeParamsClickHouse = { @@ -343,6 +347,7 @@ if (selectedYAxis) params.set("yAxis", selectedYAxis); if (searchFilter) params.set("searchFilter", searchFilter); + if (isRegex) params.set("isRegex", isRegex.toString()); router.push({ pathname: router.pathname, @@ -361,6 +366,7 @@ selectedOwners, selectedYAxis, searchFilter, + isRegex, selectedRepos, ]); @@ -391,6 +397,7 @@ smooth={false} chartType={chartType} filter={searchFilter} + isRegex={isRegex} timeFieldDisplayFormat="M/D (UTC)" sort_by="total" auto_refresh={false} @@ -559,28 +566,88 @@ ); };
+ // Create debounced search filter update function - defined once + const debouncedSetSearchFilter = useCallback( + _.debounce((value: string) => { + setSearchFilter(value); + }, 500), + [] // Empty dependency array ensures this is created only once + );
+ + // Local state for input value to keep input responsive + const [inputValue, setInputValue] = useState(initialSearchFilter || "");
+ + // Update inputValue when searchFilter changes from URL/elsewhere + useEffect(() => { + setInputValue(searchFilter); + }, [searchFilter]);
+ const generateFilterBar = (type: CostCategory, style = {}) => { + // Update the local input value immediately for responsiveness const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => { const value = e.target.value; - setTimeout(() => { - setSearchFilter(() => { - return value; - }); - }, 500); + setInputValue(value); + debouncedSetSearchFilter(value); + };
+ + const handleRegexChange = (e: React.ChangeEvent<HTMLInputElement>) => { + setIsRegex(e.target.checked); + };
 return (
-      <TextField
-        label={
-          <>Filter {type}</>
-        }
-        onChange={handleChange}
-        variant="outlined"
-        fullWidth
-      />
+      <div style={style}>
+        <TextField
+          label={
+            <>Filter {type}</>
+          }
+          onChange={handleChange}
+          variant="outlined"
+          fullWidth
+          value={inputValue}
+          InputProps={{
+            endAdornment: (
+              <InputAdornment position="end">
+                <IconButton
+                  size="small"
+                  color={isRegex ? "primary" : "default"}
+                  title="Use regular expression"
+                  onClick={() => setIsRegex(!isRegex)}
+                >
+                  .*
+                </IconButton>
+              </InputAdornment>
+            ),
+          }}
+        />
+      </div>
     );
   };
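Two patterns from the TimeSeriesPanel and cost-analysis changes above, sketched in isolation (the names here are illustrative, not the exported API): the regex filter falls back to a case-insensitive substring match when the pattern fails to compile, and the debounced setter is created once so rapid keystrokes collapse into a single state update.

import _ from "lodash";

// Mirrors the seriesWithInterpolatedTimes filtering: try the regex first,
// fall back to substring matching if the pattern is invalid.
function makeSeriesFilter(filter: string, isRegex: boolean) {
  if (isRegex) {
    try {
      const regex = new RegExp(filter, "i");
      return (name: string) => regex.test(name);
    } catch {
      // invalid regex, e.g. "foo(" — fall through to the substring match
    }
  }
  const needle = filter.toLocaleLowerCase();
  return (name: string) => name.toLocaleLowerCase().includes(needle);
}

// Mirrors the cost page wiring: a single debounced setter, 500 ms as in the diff.
const applyFilter = _.debounce((value: string) => {
  console.log("filtering on", value); // stand-in for setSearchFilter(value)
}, 500);

makeSeriesFilter("linux\\..*xlarge", true)("linux.12xlarge.ephemeral"); // true
applyFilter("linux");
applyFilter("linux.4x"); // only this last value fires after 500 ms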