Tests #1835
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2023–2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Tests

# Triggers: any pull request, a manual dispatch, and a recurring schedule.
on:
  pull_request:
  workflow_dispatch:
  schedule:
    # Minute 0 of every fourth hour.
    - cron: '0 */4 * * *'

concurrency:
  # Runs for the same pull request share one group, and all scheduled runs
  # share another, so a newer run cancels the one still in progress. Every
  # other trigger falls through to its own run id and is never cancelled.
  group: >
    ${{
      github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) ||
      github.event_name == 'schedule' && format('{0}-schedule', github.workflow) ||
      github.run_id
    }}
  cancel-in-progress: true
jobs:
  # Checks the self-hosted runner (gsutil present) and prunes old docker data
  # before anything else runs; both image jobs wait on it.
  prelim:
    runs-on: ["self-hosted"]
    steps:
      - name: Test gsutil installation
        run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
      - name: Cleanup old docker images
        run: docker system prune --all --force

  # Image jobs: both delegate to the reusable build_upload_internal.yml
  # workflow, once for the tpu image and once for the gpu image.
  tpu_image:
    needs: prelim
    uses: ./.github/workflows/build_upload_internal.yml
    with:
      device_type: tpu
      device_name: v4-8
      cloud_runner: linux-x86-n2-16-buildkit
      build_mode: jax_ai_image
      base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest

  gpu_image:
    needs: prelim
    uses: ./.github/workflows/build_upload_internal.yml
    with:
      device_type: gpu
      device_name: a100-40gb-4
      cloud_runner: linux-x86-n2-16-buildkit
      build_mode: jax_ai_image
      base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest

  # Test jobs: all delegate to the reusable run_tests_internal.yml workflow and
  # differ only in device, runner, pytest marker expression and container flags.

  # CPU-only tests run on the tpu image, invoked once per worker_group value
  # (1-4) with total_workers fixed at 4; fail-fast is off so one failing group
  # does not cancel the others.
  cpu_unit_tests:
    needs: tpu_image
    strategy:
      fail-fast: false
      matrix:
        worker_group: [1, 2, 3, 4]
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: cpu
      device_name: X64
      image_type: tpu
      pytest_marker: 'cpu_only'
      xla_python_client_mem_fraction: 0.75
      tf_force_gpu_allow_growth: false
      container_resource_option: "--privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}
      worker_group: ${{ matrix.worker_group }}
      total_workers: 4

  tpu_unit_tests:
    needs: tpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: tpu
      device_name: v4-8
      cloud_runner: linux-x86-ct4p-240-4tpu
      pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
      xla_python_client_mem_fraction: 0.75
      tf_force_gpu_allow_growth: false
      container_resource_option: "--privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  tpu_integration_tests:
    needs: tpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: tpu
      device_name: v4-8
      cloud_runner: linux-x86-ct4p-240-4tpu
      pytest_marker: 'not cpu_only and not gpu_only and integration_test'
      xla_python_client_mem_fraction: 0.75
      tf_force_gpu_allow_growth: false
      container_resource_option: "--privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  gpu_unit_tests:
    needs: gpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: gpu
      device_name: a100-40gb-4
      cloud_runner: linux-x86-a2-48-a100-4gpu
      pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
      xla_python_client_mem_fraction: 0.65
      tf_force_gpu_allow_growth: true
      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  gpu_integration_tests:
    needs: gpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: gpu
      device_name: a100-40gb-4
      cloud_runner: linux-x86-a2-48-a100-4gpu
      pytest_marker: 'not cpu_only and not tpu_only and integration_test'
      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
      xla_python_client_mem_fraction: 0.65
      tf_force_gpu_allow_growth: true
      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  # Deletes the per-run images tagged with this run's id. `always()` makes the
  # job run whatever the outcome of the test jobs it waits for.
  clean_up:
    if: ${{ always() }}
    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
    name: "Clean up"
    runs-on: ["self-hosted"]
    permissions:
      contents: read
      issues: write
    steps:
      - name: Authenticate gcloud
        run: |
          # configure registries as root and as runner
          gcloud auth configure-docker --quiet
          gcloud auth configure-docker us-docker.pkg.dev --quiet
      - name: Delete the tpu image
        run: gcloud container images delete "gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu" --force-delete-tags --quiet
      - name: Delete the gpu image
        # Steps are skipped once an earlier step fails; without this condition
        # a failed tpu deletion would leave the gpu image behind.
        if: ${{ always() }}
        run: gcloud container images delete "gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu" --force-delete-tags --quiet

  # Opens or updates the tracking issue when a test job failed, then resets the
  # consecutive-success labels on that issue. The job itself always runs; the
  # step conditions limit it to non-PR, non-manual triggers.
  notify_failure:
    name: Notify failed build # creates an issue or modifies last open existing issue for failed build
    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
    if: ${{ always() }}
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - name: Check whether one of the jobs failed
        id: report_failure
        if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
        # NOTE(review): the action reference below looks mangled by e-mail
        # obfuscation in this copy of the file; restore the real
        # `owner/repo@version` value before relying on this step.
        uses: jayqi/[email protected]
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          # NOTE(review): confirm `labels` is an input this action accepts and
          # that it results in the same "failed-build" label the close-out job
          # searches for.
          labels: "failed-build"
      - name: Reset consecutive success counter on failure
        # This runs only if the previous step actually found or created an issue
        if: ${{ steps.report_failure.outputs.issue-number != '' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
          # Use the issue number from the previous step's output
          ISSUE_NUMBER: ${{ steps.report_failure.outputs.issue-number }}
        run: |
          echo "A failure occurred. Resetting success counter on issue #${ISSUE_NUMBER}."
          # Labels are changed through `gh issue edit`; the gh CLI has no
          # `gh issue remove-label` subcommand. Kept best-effort: a failure
          # here must not fail the notification job.
          gh issue edit "$ISSUE_NUMBER" --remove-label "success-run-1,success-run-2" || echo "No success labels to remove."

  # Counts consecutive green runs on the open "failed-build" issue using the
  # labels success-run-1 -> success-run-2, and closes the issue on the third.
  # NOTE(review): the success-run-* labels are assumed to already exist in the
  # repository — confirm, since adding an unknown label is expected to fail.
  notify_success_and_close:
    name: Close issue after 3 successful builds
    # This job runs only if all the preceding test jobs succeeded
    if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - name: Find existing failure issue
        id: find_issue
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
        run: |
          ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
          if [[ -z "$ISSUE_NUMBER" ]]; then
            echo "No open build failure issue found. Nothing to do."
            echo "issue_number=" >> "$GITHUB_OUTPUT"
          else
            echo "Found open build failure issue: #${ISSUE_NUMBER}"
            echo "issue_number=${ISSUE_NUMBER}" >> "$GITHUB_OUTPUT"
          fi
      - name: Add success label or close issue
        if: steps.find_issue.outputs.issue_number != ''
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
          # Passed through the environment instead of being interpolated into
          # the script text.
          ISSUE_NUMBER: ${{ steps.find_issue.outputs.issue_number }}
        run: |
          LABELS=$(gh issue view "$ISSUE_NUMBER" --json labels -q '.labels[].name')
          # All label changes below go through `gh issue edit`; the gh CLI has
          # no `gh issue add-label` / `gh issue remove-label` subcommands.
          if echo "$LABELS" | grep -q "success-run-2"; then
            echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
            gh issue comment "$ISSUE_NUMBER" --body "Build succeeded for the third consecutive time. Closing this issue automatically."
            gh issue close "$ISSUE_NUMBER"
            # Clean up all tracking labels
            gh issue edit "$ISSUE_NUMBER" --remove-label "failed-build,success-run-2"
          elif echo "$LABELS" | grep -q "success-run-1"; then
            echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
            gh issue comment "$ISSUE_NUMBER" --body "Build succeeded for the second time. One more successful run will close this issue."
            gh issue edit "$ISSUE_NUMBER" --remove-label "success-run-1" --add-label "success-run-2"
          else
            echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
            gh issue comment "$ISSUE_NUMBER" --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
            gh issue edit "$ISSUE_NUMBER" --add-label "success-run-1"
          fi