Merge branch 'main' into addUtilForLinuxBuild #4
name: linux-build

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      docker-image-name:
        required: true
        type: string
        description: Name of the base docker image to build with.
      build-generates-artifacts:
        required: false
        type: boolean
        default: true
        description: If set, upload generated build artifacts.
      build-with-debug:
        required: false
        type: boolean
        default: false
        description: If set, build in debug mode.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      cuda-arch-list:
        required: false
        type: string
        default: "5.2"
        description: |
          List of CUDA architectures the CI build should target.
      runner_prefix:
        required: false
        default: ""
        type: string
        description: Prefix for the runner label.
      runner:
        required: false
        type: string
        default: "linux.2xlarge"
        description: |
          Label of the runner this job should run on.
      test-matrix:
        required: false
        type: string
        description: |
          An optional JSON description of what test configs to run later on. This
          is moved here from the Linux test workflow so that we can apply filter
          logic using test-config labels earlier and skip unnecessary builds.
      selected-test-configs:
        description: |
          A comma-separated list of test configurations from the test matrix to
          keep. An empty list means every configuration is kept by default.
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifacts from.
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume for downloading artifacts.
        required: false
        type: string
        default: ""
      max-jobs:
        description: |
          Override the number of jobs to use for the build.
        required: false
        type: string
      disable-monitor:
        description: |
          Disable utilization monitoring for the build job.
        required: false
        type: boolean
        default: false
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF auth token to avoid rate limits when downloading models or datasets from the hub.
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
          FB app token to write to the scribe endpoint.
    outputs:
      docker-image:
        value: ${{ jobs.build.outputs.docker-image }}
        description: The docker image containing the built PyTorch.
      test-matrix:
        value: ${{ jobs.build.outputs.test-matrix }}
        description: An optional JSON description of what test configs to run later on.
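
# A minimal sketch of how a caller might consume this reusable workflow.
# The caller job name, input values, and the file path of this workflow
# are all hypothetical; adjust them to your repository layout:
#
#   jobs:
#     build:
#       uses: ./.github/workflows/_linux-build.yml
#       with:
#         build-environment: linux-jammy-py3-gcc11
#         docker-image-name: pytorch-linux-jammy-py3-gcc11
#         test-matrix: |
#           { include: [{ config: "default", shard: 1, num_shards: 1 }] }
#       secrets: inherit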
jobs:
  build:
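    # Map the trigger to a deployment environment that scopes the Scribe
    # token: pushes to main or release/* select 'scribe-protected', PRs
    # carrying the 'ci-scribe' label select 'scribe-pr', and everything
    # else gets no environment.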
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    # Don't run on forked repos
    if: github.repository_owner == 'pytorch'
    runs-on: ${{ inputs.runner_prefix }}${{ inputs.runner }}
    timeout-minutes: 240
    outputs:
      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
      # [pytorch repo ref]
      # Use a pytorch/pytorch reference instead of a reference to the local
      # checkout because when we run this action we don't *have* a local
      # checkout. In other cases you should prefer a local checkout.
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          no-sudo: true
      - name: Setup Linux
        uses: ./.github/actions/setup-linux
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
      - name: configure aws credentials
        uses: aws-actions/configure-aws-credentials@v3
        if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-build
          aws-region: us-east-1
      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image-name }}
      - name: Use the following to pull a public copy of the image
        id: print-ghcr-mirror
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
          tag=${ECR_DOCKER_IMAGE##*/}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py
      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
      # Apply the filter logic to the build step too if the test-config label is already there
      - name: Select all requested test configurations (if the test matrix is available)
        id: filter
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          selected-test-configs: ${{ inputs.selected-test-configs }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}
      - name: Start monitoring script
        id: monitor-script
        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        env:
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          WORKFLOW_NAME: ${{ github.workflow }}
          WORKFLOW_RUN_ID: ${{ github.run_id }}
        run: |
          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
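          # Launch the utilization monitor in the background; "$!" holds the
          # PID of the most recent background process, recorded as a step
          # output so the "Stop monitoring script" step can kill it later.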
          python3 -m tools.stats.monitor > "usage_log_build_${JOB_ID}.txt" 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
      - name: Download pytest cache
        uses: ./.github/actions/pytest-cache-download
        continue-on-error: true
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          cache_dir: .pytest_cache
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
          s3_bucket: ${{ inputs.s3-bucket }}
      - name: Build
        if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
        id: build
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          # TODO duplicated
          AWS_DEFAULT_REGION: us-east-1
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_REGION: us-east-1
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
          TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
          DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
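          # GitHub Actions expressions have no ternary operator; the
          # `condition && if-true || if-false` pattern below emulates one
          # (safe here because the if-true values are non-empty strings).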
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            JENKINS_USER=
            USED_IMAGE="${DOCKER_IMAGE_S390X}"
            # ensure that docker container cleanly exits in 12 hours
            # if for some reason cleanup action doesn't stop container
            # when job is cancelled
            DOCKER_SHELL_CMD="sleep 12h"
            # since some steps are skipped on s390x, if they are necessary, run them here
            env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
            env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          else
            JENKINS_USER="--user jenkins"
            USED_IMAGE="${DOCKER_IMAGE}"
            DOCKER_SHELL_CMD=
          fi
          if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
            MAX_JOBS="$(nproc --ignore=2)"
          else
            MAX_JOBS="${MAX_JOBS_OVERRIDE}"
          fi
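          # e.g. with no override on a hypothetical 16-core runner:
          #   MAX_JOBS = nproc - 2 = 14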
          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
          # comes from https://github.com/pytorch/test-infra/pull/6058
          TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
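          # Worked example on a hypothetical 16 GB runner: MemTotal=16777216 kB, so
          #   TOTAL_AVAILABLE_MEMORY_IN_GB = 16777216/1024/1024 - 1 = 15.000
          #   TOTAL_MEMORY_WITH_SWAP       = 15 + 3                = 18
          # i.e. docker run below gets --memory=15g --memory-swap=18g,
          # allowing 15 GB of RAM plus 3 GB of swap.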
          # detached container should get cleaned up by teardown_ec2_linux
          # Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty
          # shellcheck disable=SC2086
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
            -e MAX_JOBS=${MAX_JOBS} \
            -e MAX_JOBS_OVERRIDE \
            -e AWS_DEFAULT_REGION \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
            -e SCCACHE_BUCKET \
            -e SCCACHE_REGION \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e TORCH_CUDA_ARCH_LIST \
            -e PR_LABELS \
            -e OUR_GITHUB_JOB_ID \
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e USE_SPLIT_BUILD \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --tty \
            --detach \
            ${JENKINS_USER} \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${USED_IMAGE}" \
            ${DOCKER_SHELL_CMD}
          )
          docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
          END_TIME=$(date +%s)
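          # Export the wall-clock build time; the "Upload sccache stats"
          # step below reads it back as its build-time input.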
| echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT" | ||
      - name: Archive artifacts into zip
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files
      - name: Store PyTorch Build Artifacts on S3
        uses: seemethere/upload-artifact-s3@v5
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}
      - name: Store PyTorch Build Artifacts for s390x
        uses: actions/upload-artifact@v4
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip
      - name: Stop monitoring script
        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"
      - name: Check all txt
        shell: bash
        continue-on-error: true
        run: |
          ls *.txt
      - name: Upload raw usage log to s3
        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        uses: seemethere/upload-artifact-s3@v5
        with:
          s3-prefix: |
            ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
          retention-days: 14
          if-no-files-found: warn
          path: usage_log_build_*.txt
      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/upload-sccache-stats
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          build-time: ${{ steps.build.outputs.build_time }}
      - name: Upload utilization stats
        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        continue-on-error: true
        uses: ./.github/actions/upload-utilization-stats
        with:
          job_id: ${{ steps.get-job-id.outputs.job-id }}
          job_name: ${{ steps.get-job-id.outputs.job-name }}
          workflow_name: ${{ github.workflow }}
          workflow_run_id: ${{ github.run_id }}
          workflow_attempt: ${{ github.run_attempt }}
          artifact_prefix: usage_log_build
      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
      - name: Cleanup docker
        if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
        shell: bash
        run: |
          # on s390x, stop every running container for a clean worker stop
          # (docker stop/kill take container IDs; they have no -a flag)
          docker ps -q | xargs -r docker stop || true
          docker ps -q | xargs -r docker kill || true