# Captured from the GitHub Actions web UI: workflow "Tests", run #1858.
# The workflow file for this run follows.

# Copyright 2023–2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# Workflow identity, triggers, and run de-duplication.
name: Tests

# Triggers: pull requests, manual dispatch, and a recurring schedule.
on:
  pull_request:
  workflow_dispatch:
  schedule:
    # Run the job every 4 hours
    - cron: '0 */4 * * *'

concurrency:
  # Dedup pull requests (canceling previous runs of the same workflow for the same PR)
  # and scheduled runs, but nothing else: every other event falls through to
  # github.run_id, which gives each run its own group so nothing is cancelled.
  group: >
    ${{
      github.event_name == 'pull_request' && format('{0}-pr-{1}', github.workflow, github.event.pull_request.number) ||
      github.event_name == 'schedule' && format('{0}-schedule', github.workflow) ||
      github.run_id
    }}
  cancel-in-progress: true
jobs:
  # Runner sanity checks executed before anything else: gsutil must be on PATH,
  # and unused docker data is pruned.
  prelim:
    runs-on: ["self-hosted"]
    steps:
      - name: Test gsutil installation
        run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
      - name: Cleanup old docker images
        run: docker system prune --all --force

  # Image builds, delegated to the build_upload_internal reusable workflow.
  tpu_image:
    needs: prelim
    uses: ./.github/workflows/build_upload_internal.yml
    with:
      device_type: tpu
      device_name: v4-8
      cloud_runner: linux-x86-n2-16-buildkit
      build_mode: jax_ai_image
      base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest

  gpu_image:
    needs: prelim
    uses: ./.github/workflows/build_upload_internal.yml
    with:
      device_type: gpu
      device_name: a100-40gb-4
      cloud_runner: linux-x86-n2-16-buildkit
      build_mode: jax_ai_image
      base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest

  # Test jobs, delegated to the run_tests_internal reusable workflow.
  # fail-fast is disabled so one failing matrix entry does not cancel the others.
  cpu_unit_tests:
    needs: tpu_image
    strategy:
      fail-fast: false
      matrix:
        worker_group: [1, 2, 3, 4]
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: cpu
      device_name: X64
      # The CPU tests use the tpu image type (this job depends on tpu_image).
      image_type: tpu
      pytest_marker: 'cpu_only'
      xla_python_client_mem_fraction: 0.75
      tf_force_gpu_allow_growth: false
      container_resource_option: "--privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}
      worker_group: ${{ matrix.worker_group }}
      # Keep in sync with the number of entries in matrix.worker_group above.
      total_workers: 4

  tpu_unit_tests:
    needs: tpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: tpu
      device_name: v4-8
      cloud_runner: linux-x86-ct4p-240-4tpu
      pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
      xla_python_client_mem_fraction: 0.75
      tf_force_gpu_allow_growth: false
      container_resource_option: "--privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  tpu_integration_tests:
    needs: tpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: tpu
      device_name: v4-8
      cloud_runner: linux-x86-ct4p-240-4tpu
      pytest_marker: 'not cpu_only and not gpu_only and integration_test'
      xla_python_client_mem_fraction: 0.75
      tf_force_gpu_allow_growth: false
      container_resource_option: "--privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  gpu_unit_tests:
    needs: gpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: gpu
      device_name: a100-40gb-4
      cloud_runner: linux-x86-a2-48-a100-4gpu
      pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
      xla_python_client_mem_fraction: 0.65
      tf_force_gpu_allow_growth: true
      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  gpu_integration_tests:
    needs: gpu_image
    uses: ./.github/workflows/run_tests_internal.yml
    with:
      device_type: gpu
      device_name: a100-40gb-4
      cloud_runner: linux-x86-a2-48-a100-4gpu
      pytest_marker: 'not cpu_only and not tpu_only and integration_test'
      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
      xla_python_client_mem_fraction: 0.65
      tf_force_gpu_allow_growth: true
      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
      is_scheduled_run: ${{ github.event_name == 'schedule' }}

  # Deletes the images tagged with this run's id. Runs regardless of the test
  # results (always()), after every test job has finished.
  clean_up:
    if: ${{ always() }}
    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
    name: "Clean up"
    runs-on: ["self-hosted"]
    permissions:
      contents: read
      issues: write
    steps:
      - name: Authenticate gcloud
        run: |
          # configure registries as root and as runner
          gcloud auth configure-docker --quiet
          gcloud auth configure-docker us-docker.pkg.dev --quiet
      - name: Delete the tpu image
        run: gcloud container images delete "gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu" --force-delete-tags --quiet
      - name: Delete the gpu image
        # Attempt this even if the tpu deletion above failed, so that a failure
        # on one image does not leave the other one behind.
        if: ${{ always() }}
        run: gcloud container images delete "gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu" --force-delete-tags --quiet

  notify_failure:
    name: Notify failed build  # creates an issue or modifies last open existing issue for failed build
    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
    if: ${{ always() }}
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - name: Check whether one of the jobs failed
        id: report_failure
        # Only report failures of runs that are neither pull requests nor manual dispatches.
        if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
        # NOTE(review): the action reference was garbled ("jayqi/[email protected]") in the
        # captured source; restored as jayqi/failed-build-issue-action@v1.2.0 - confirm the
        # exact action and version. Also confirm that `labels` is an input this action
        # accepts: the success job below looks issues up by the "failed-build" label.
        uses: jayqi/failed-build-issue-action@v1.2.0
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          labels: "failed-build"
      - name: Reset consecutive success counter on failure
        # This runs only if the previous step actually found or created an issue
        if: ${{ steps.report_failure.outputs.issue-number != '' }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
          # Use the issue number from the previous step's output
          ISSUE_NUMBER: ${{ steps.report_failure.outputs.issue-number }}
        run: |
          echo "A failure occurred. Resetting success counter on issue #${ISSUE_NUMBER}."
          # Remove both counter labels in one call. `gh issue` has no `remove-label`
          # subcommand; labels are changed through `gh issue edit`.
          # The `|| echo` fallback keeps this best-effort: a failed removal is logged
          # instead of failing the step.
          gh issue edit "$ISSUE_NUMBER" --remove-label "success-run-1,success-run-2" --repo "$GH_REPO" || echo "No success labels to remove."

  notify_success_and_close:
    name: Close issue after 3 successful builds
    # This job runs only if all the preceding test jobs succeeded
    if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - name: Find existing failure issue
        id: find_issue
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
        run: |
          # `// empty` yields no output at all when there is no open issue.
          ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number // empty')
          if [[ -z "$ISSUE_NUMBER" ]]; then
            echo "No open build failure issue found. Nothing to do."
            echo "issue_number=" >> "$GITHUB_OUTPUT"
          else
            echo "Found open build failure issue: #${ISSUE_NUMBER}"
            echo "issue_number=${ISSUE_NUMBER}" >> "$GITHUB_OUTPUT"
          fi
      - name: Add success label or close issue
        if: steps.find_issue.outputs.issue_number != ''
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
          # Passed through the environment rather than interpolated into the script text.
          ISSUE_NUMBER: ${{ steps.find_issue.outputs.issue_number }}
        run: |
          # The success counter is stored as labels on the issue:
          #   no counter label -> success-run-1 -> success-run-2 -> closed.
          # NOTE(review): `gh issue edit --add-label` presumably requires the label to
          # already exist in the repository - confirm success-run-1 / success-run-2 are defined.
          LABELS=$(gh issue view "$ISSUE_NUMBER" --json labels -q '.labels[].name')
          # -F -x: match a whole label name exactly, not a substring of another label.
          if grep -Fxq "success-run-2" <<< "$LABELS"; then
            echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
            gh issue comment "$ISSUE_NUMBER" --body "Build succeeded for the third consecutive time. Closing this issue automatically."
            gh issue close "$ISSUE_NUMBER"
            # Clean up all tracking labels
            gh issue edit "$ISSUE_NUMBER" --remove-label "failed-build,success-run-2" --repo "$GH_REPO"
          elif grep -Fxq "success-run-1" <<< "$LABELS"; then
            echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
            gh issue comment "$ISSUE_NUMBER" --body "Build succeeded for the second time. One more successful run will close this issue."
            # Swap the counter label in a single edit.
            gh issue edit "$ISSUE_NUMBER" --remove-label "success-run-1" --add-label "success-run-2" --repo "$GH_REPO"
          else
            echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
            gh issue comment "$ISSUE_NUMBER" --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
            gh issue edit "$ISSUE_NUMBER" --add-label "success-run-1" --repo "$GH_REPO"
          fi