Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 145 additions & 130 deletions .github/workflows/RunTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,64 +53,77 @@ jobs:
build_mode: jax_ai_image
base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest

gpu_image:
needs: prelim
uses: ./.github/workflows/build_upload_internal.yml
with:
device_type: gpu
device_name: a100-40gb-4
cloud_runner: linux-x86-n2-16-buildkit
build_mode: jax_ai_image
base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest
# gpu_image:
# needs: prelim
# uses: ./.github/workflows/build_upload_internal.yml
# with:
# device_type: gpu
# device_name: a100-40gb-4
# cloud_runner: linux-x86-n2-16-buildkit
# build_mode: jax_ai_image
# base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest

cpu_unit_tests:
needs: tpu_image
strategy:
fail-fast: false
matrix:
worker_group: [1, 2, 3, 4]
uses: ./.github/workflows/run_tests_internal.yml
with:
device_type: cpu
device_name: X64
image_type: tpu
pytest_marker: 'cpu_only'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
worker_group: ${{ matrix.worker_group }}
total_workers: 4
# cpu_unit_tests:
# needs: tpu_image
# strategy:
# fail-fast: false
# matrix:
# worker_group: [1, 2, 3, 4]
# uses: ./.github/workflows/run_tests_internal.yml
# with:
# device_type: cpu
# device_name: X64
# image_type: tpu
# pytest_marker: 'cpu_only'
# xla_python_client_mem_fraction: 0.75
# tf_force_gpu_allow_growth: false
# container_resource_option: "--privileged"
# is_scheduled_run: ${{ github.event_name == 'schedule' }}
# worker_group: ${{ matrix.worker_group }}
# total_workers: 4

tpu_unit_tests:
needs: tpu_image
uses: ./.github/workflows/run_tests_internal.yml
with:
device_type: tpu
device_name: v4-8
cloud_runner: linux-x86-ct4p-240-4tpu
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
# tpu_unit_tests:
# needs: tpu_image
# uses: ./.github/workflows/run_tests_internal.yml
# with:
# device_type: tpu
# device_name: v4-8
# cloud_runner: linux-x86-ct4p-240-4tpu
# pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
# xla_python_client_mem_fraction: 0.75
# tf_force_gpu_allow_growth: false
# container_resource_option: "--privileged"
# is_scheduled_run: ${{ github.event_name == 'schedule' }}

tpu_pathways_unit_tests:
needs: tpu_image
uses: ./.github/workflows/run_pathways_tests_internal.yml
with:
device_type: tpu
device_name: v4-8
cloud_runner: linux-x86-ct4p-240-4tpu
pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
xla_python_client_mem_fraction: 0.75
tf_force_gpu_allow_growth: false
container_resource_option: "--privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
# tpu_pathways_unit_tests:
# needs: tpu_image
# uses: ./.github/workflows/run_pathways_tests_internal.yml
# with:
# device_type: tpu
# device_name: v4-8
# cloud_runner: linux-x86-ct4p-240-4tpu
# pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
# xla_python_client_mem_fraction: 0.75
# tf_force_gpu_allow_growth: false
# container_resource_option: "--privileged"
# is_scheduled_run: ${{ github.event_name == 'schedule' }}

# tpu_integration_tests:
# needs: tpu_image
# uses: ./.github/workflows/run_tests_internal.yml
# with:
# device_type: tpu
# device_name: v4-8
# cloud_runner: linux-x86-ct4p-240-4tpu
# pytest_marker: 'not cpu_only and not gpu_only and integration_test'
# xla_python_client_mem_fraction: 0.75
# tf_force_gpu_allow_growth: false
# container_resource_option: "--privileged"
# is_scheduled_run: ${{ github.event_name == 'schedule' }}

tpu_integration_tests:
tpu_pathways_integration_tests:
needs: tpu_image
uses: ./.github/workflows/run_tests_internal.yml
uses: ./.github/workflows/run_pathways_tests_internal.yml
with:
device_type: tpu
device_name: v4-8
Expand All @@ -121,37 +134,38 @@ jobs:
container_resource_option: "--privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}

gpu_unit_tests:
needs: gpu_image
uses: ./.github/workflows/run_tests_internal.yml
with:
device_type: gpu
device_name: a100-40gb-4
cloud_runner: linux-x86-a2-48-a100-4gpu
pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
pytest_addopts: '--ignore=tests/sft_hooks_test.py'
xla_python_client_mem_fraction: 0.65
tf_force_gpu_allow_growth: true
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
# gpu_unit_tests:
# needs: gpu_image
# uses: ./.github/workflows/run_tests_internal.yml
# with:
# device_type: gpu
# device_name: a100-40gb-4
# cloud_runner: linux-x86-a2-48-a100-4gpu
# pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
# pytest_addopts: '--ignore=tests/sft_hooks_test.py'
# xla_python_client_mem_fraction: 0.65
# tf_force_gpu_allow_growth: true
# container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
# is_scheduled_run: ${{ github.event_name == 'schedule' }}

gpu_integration_tests:
needs: gpu_image
uses: ./.github/workflows/run_tests_internal.yml
with:
device_type: gpu
device_name: a100-40gb-4
cloud_runner: linux-x86-a2-48-a100-4gpu
pytest_marker: 'not cpu_only and not tpu_only and integration_test'
pytest_addopts: '--ignore=tests/sft_hooks_test.py'
xla_python_client_mem_fraction: 0.65
tf_force_gpu_allow_growth: true
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
is_scheduled_run: ${{ github.event_name == 'schedule' }}
# gpu_integration_tests:
# needs: gpu_image
# uses: ./.github/workflows/run_tests_internal.yml
# with:
# device_type: gpu
# device_name: a100-40gb-4
# cloud_runner: linux-x86-a2-48-a100-4gpu
# pytest_marker: 'not cpu_only and not tpu_only and integration_test'
# pytest_addopts: '--ignore=tests/sft_hooks_test.py'
# xla_python_client_mem_fraction: 0.65
# tf_force_gpu_allow_growth: true
# container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
# is_scheduled_run: ${{ github.event_name == 'schedule' }}

clean_up:
if: ${{ always() }}
needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
needs: [tpu_pathways_integration_tests]
# needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
name: "Clean up"
runs-on: ["self-hosted"]
permissions:
Expand All @@ -170,7 +184,8 @@ jobs:

notify_failure:
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
needs: [tpu_pathways_integration_tests]
# needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
if: ${{ always() }}
runs-on: ubuntu-latest
permissions:
Expand Down Expand Up @@ -198,52 +213,52 @@ jobs:
# It will not fail if the labels don't exist.
gh issue remove-label $ISSUE_NUMBER "success-run-1" "success-run-2" --repo $GH_REPO || echo "No success labels to remove."

notify_success_and_close:
name: Close issue after 3 successful builds
# This job runs only if all the preceding test jobs succeeded
if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
runs-on: ubuntu-latest
permissions:
issues: write
steps:
- name: Find existing failure issue
id: find_issue
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GH_REPO: ${{ github.repository }}
run: |
ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
if [[ -z "$ISSUE_NUMBER" ]]; then
echo "No open build failure issue found. Nothing to do."
echo "issue_number=" >> $GITHUB_OUTPUT
else
echo "Found open build failure issue: #${ISSUE_NUMBER}"
echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
fi

- name: Add success label or close issue
if: steps.find_issue.outputs.issue_number != ''
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GH_REPO: ${{ github.repository }}
run: |
ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')

if echo "$LABELS" | grep -q "success-run-2"; then
echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
gh issue close $ISSUE_NUMBER
# Clean up all tracking labels
gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
elif echo "$LABELS" | grep -q "success-run-1"; then
echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
else
echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
fi
# notify_success_and_close:
# name: Close issue after 3 successful builds
# # This job runs only if all the preceding test jobs succeeded
# if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
# needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
# runs-on: ubuntu-latest
# permissions:
# issues: write
# steps:
# - name: Find existing failure issue
# id: find_issue
# env:
# GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# GH_REPO: ${{ github.repository }}
# run: |
# ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
# if [[ -z "$ISSUE_NUMBER" ]]; then
# echo "No open build failure issue found. Nothing to do."
# echo "issue_number=" >> $GITHUB_OUTPUT
# else
# echo "Found open build failure issue: #${ISSUE_NUMBER}"
# echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
# fi

# - name: Add success label or close issue
# if: steps.find_issue.outputs.issue_number != ''
# env:
# GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# GH_REPO: ${{ github.repository }}
# run: |
# ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
# LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')

# if echo "$LABELS" | grep -q "success-run-2"; then
# echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
# gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
# gh issue close $ISSUE_NUMBER
# # Clean up all tracking labels
# gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
# elif echo "$LABELS" | grep -q "success-run-1"; then
# echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
# gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
# gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
# gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
# else
# echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
# gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
# gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
# fi
1 change: 1 addition & 0 deletions .github/workflows/run_pathways_tests_internal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ jobs:
IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS: true
JAX_PLATFORMS: "proxy"
JAX_BACKEND_TARGET: "grpc://localhost:29000"
JAX_COORDINATOR_ADDRESS: "localhost"
options: ${{ inputs.container_resource_option }}
steps:
- uses: actions/checkout@v4
Expand Down
1 change: 1 addition & 0 deletions src/MaxText/max_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def maybe_initialize_jax_distributed_system(raw_keys):

For CPUs, we call jax.distributed.initialize() explicitly, with the specified arguments.
"""
print(f"LOG: maybe_initialize_jax_distributed_system - {raw_keys = }")
if raw_keys["skip_jax_distributed_system"]:
max_logging.log("Skipping jax distributed system due to skip_jax_distributed_system=True flag.")
return
Expand Down
12 changes: 12 additions & 0 deletions tests/integration_tests/checkpointing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,18 @@ def run_checkpointing(hardware, attention_type):
"grain_worker_count=0",
"grain_train_files=/tmp/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record*",
]

command = get_checkpointing_command(
run_date,
hardware=hardware,
steps=1,
metrics_file="saved_metrics.txt",
attention_type=attention_type,
dataset_type="grain",
dataset_path="/tmp/gcsfuse",
) + grain_command
print(f"LOG: {command = }")

train_main(
get_checkpointing_command(
run_date,
Expand Down