AI-Hypercomputer · JulieKuo · Nov 7, 2025 · Nov 7, 2025 · Nov 7, 2025
@@ -53,64 +53,77 @@ jobs:
       build_mode: jax_ai_image
       base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
 
-  gpu_image:
-    needs: prelim
-    uses: ./.github/workflows/build_upload_internal.yml
-    with:
-      device_type: gpu
-      device_name: a100-40gb-4
-      cloud_runner: linux-x86-n2-16-buildkit
-      build_mode: jax_ai_image
-      base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest
+  # gpu_image:
+  #   needs: prelim
+  #   uses: ./.github/workflows/build_upload_internal.yml
+  #   with:
+  #     device_type: gpu
+  #     device_name: a100-40gb-4
+  #     cloud_runner: linux-x86-n2-16-buildkit
+  #     build_mode: jax_ai_image
+  #     base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/gpu:latest
 
-  cpu_unit_tests:
-    needs: tpu_image
-    strategy:
-      fail-fast: false
-      matrix:
-        worker_group: [1, 2, 3, 4]
-    uses: ./.github/workflows/run_tests_internal.yml
-    with:
-      device_type: cpu
-      device_name: X64
-      image_type: tpu
-      pytest_marker: 'cpu_only'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
-      worker_group: ${{ matrix.worker_group }}
-      total_workers: 4
+  # cpu_unit_tests:
+  #   needs: tpu_image
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       worker_group: [1, 2, 3, 4]
+  #   uses: ./.github/workflows/run_tests_internal.yml
+  #   with:
+  #     device_type: cpu
+  #     device_name: X64
+  #     image_type: tpu
+  #     pytest_marker: 'cpu_only'
+  #     xla_python_client_mem_fraction: 0.75
+  #     tf_force_gpu_allow_growth: false
+  #     container_resource_option: "--privileged"
+  #     is_scheduled_run: ${{ github.event_name == 'schedule' }}
+  #     worker_group: ${{ matrix.worker_group }}
+  #     total_workers: 4
 
-  tpu_unit_tests:
-    needs: tpu_image
-    uses: ./.github/workflows/run_tests_internal.yml
-    with:
-      device_type: tpu
-      device_name: v4-8
-      cloud_runner: linux-x86-ct4p-240-4tpu
-      pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+  # tpu_unit_tests:
+  #   needs: tpu_image
+  #   uses: ./.github/workflows/run_tests_internal.yml
+  #   with:
+  #     device_type: tpu
+  #     device_name: v4-8
+  #     cloud_runner: linux-x86-ct4p-240-4tpu
+  #     pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
+  #     xla_python_client_mem_fraction: 0.75
+  #     tf_force_gpu_allow_growth: false
+  #     container_resource_option: "--privileged"
+  #     is_scheduled_run: ${{ github.event_name == 'schedule' }}
 
-  tpu_pathways_unit_tests:
-    needs: tpu_image
-    uses: ./.github/workflows/run_pathways_tests_internal.yml
-    with:
-      device_type: tpu
-      device_name: v4-8
-      cloud_runner: linux-x86-ct4p-240-4tpu
-      pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
-      xla_python_client_mem_fraction: 0.75
-      tf_force_gpu_allow_growth: false
-      container_resource_option: "--privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+  # tpu_pathways_unit_tests:
+  #   needs: tpu_image
+  #   uses: ./.github/workflows/run_pathways_tests_internal.yml
+  #   with:
+  #     device_type: tpu
+  #     device_name: v4-8
+  #     cloud_runner: linux-x86-ct4p-240-4tpu
+  #     pytest_marker: 'not cpu_only and not gpu_only and not integration_test'
+  #     xla_python_client_mem_fraction: 0.75
+  #     tf_force_gpu_allow_growth: false
+  #     container_resource_option: "--privileged"
+  #     is_scheduled_run: ${{ github.event_name == 'schedule' }}
+
+  # tpu_integration_tests:
+  #   needs: tpu_image
+  #   uses: ./.github/workflows/run_tests_internal.yml
+  #   with:
+  #     device_type: tpu
+  #     device_name: v4-8
+  #     cloud_runner: linux-x86-ct4p-240-4tpu
+  #     pytest_marker: 'not cpu_only and not gpu_only and integration_test'
+  #     xla_python_client_mem_fraction: 0.75
+  #     tf_force_gpu_allow_growth: false
+  #     container_resource_option: "--privileged"
+  #     is_scheduled_run: ${{ github.event_name == 'schedule' }}
 
-  tpu_integration_tests:
+  tpu_pathways_integration_tests:
     needs: tpu_image
-    uses: ./.github/workflows/run_tests_internal.yml
+    uses: ./.github/workflows/run_pathways_tests_internal.yml
     with:
       device_type: tpu
       device_name: v4-8
@@ -121,37 +134,38 @@ jobs:
       container_resource_option: "--privileged"
       is_scheduled_run: ${{ github.event_name == 'schedule' }}
 
-  gpu_unit_tests:
-    needs: gpu_image
-    uses: ./.github/workflows/run_tests_internal.yml
-    with:
-      device_type: gpu
-      device_name: a100-40gb-4
-      cloud_runner: linux-x86-a2-48-a100-4gpu
-      pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
-      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
-      xla_python_client_mem_fraction: 0.65
-      tf_force_gpu_allow_growth: true
-      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+  # gpu_unit_tests:
+  #   needs: gpu_image
+  #   uses: ./.github/workflows/run_tests_internal.yml
+  #   with:
+  #     device_type: gpu
+  #     device_name: a100-40gb-4
+  #     cloud_runner: linux-x86-a2-48-a100-4gpu
+  #     pytest_marker: 'not cpu_only and not tpu_only and not integration_test'
+  #     pytest_addopts: '--ignore=tests/sft_hooks_test.py'
+  #     xla_python_client_mem_fraction: 0.65
+  #     tf_force_gpu_allow_growth: true
+  #     container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
+  #     is_scheduled_run: ${{ github.event_name == 'schedule' }}
 
-  gpu_integration_tests:
-    needs: gpu_image
-    uses: ./.github/workflows/run_tests_internal.yml
-    with:
-      device_type: gpu
-      device_name: a100-40gb-4
-      cloud_runner: linux-x86-a2-48-a100-4gpu
-      pytest_marker: 'not cpu_only and not tpu_only and integration_test'
-      pytest_addopts: '--ignore=tests/sft_hooks_test.py'
-      xla_python_client_mem_fraction: 0.65
-      tf_force_gpu_allow_growth: true
-      container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
-      is_scheduled_run: ${{ github.event_name == 'schedule' }}
+  # gpu_integration_tests:
+  #   needs: gpu_image
+  #   uses: ./.github/workflows/run_tests_internal.yml
+  #   with:
+  #     device_type: gpu
+  #     device_name: a100-40gb-4
+  #     cloud_runner: linux-x86-a2-48-a100-4gpu
+  #     pytest_marker: 'not cpu_only and not tpu_only and integration_test'
+  #     pytest_addopts: '--ignore=tests/sft_hooks_test.py'
+  #     xla_python_client_mem_fraction: 0.65
+  #     tf_force_gpu_allow_growth: true
+  #     container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
+  #     is_scheduled_run: ${{ github.event_name == 'schedule' }}
 
   clean_up:
     if: ${{ always() }}
-    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
+    needs: [tpu_pathways_integration_tests]
+    # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
     name: "Clean up"
     runs-on: ["self-hosted"]
     permissions:
@@ -170,7 +184,8 @@ jobs:
 
   notify_failure:
     name: Notify failed build # creates an issue or modifies last open existing issue for failed build
-    needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
+    needs: [tpu_pathways_integration_tests]
+    # needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
     if: ${{ always() }}
     runs-on: ubuntu-latest
     permissions:
@@ -198,52 +213,52 @@ jobs:
         # It will not fail if the labels don't exist.
         gh issue remove-label $ISSUE_NUMBER "success-run-1" "success-run-2" --repo $GH_REPO || echo "No success labels to remove."
 
-  notify_success_and_close:
-      name: Close issue after 3 successful builds
-      # This job runs only if all the preceding test jobs succeeded
-      if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
-      needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests]
-      runs-on: ubuntu-latest
-      permissions:
-        issues: write
-      steps:
-        - name: Find existing failure issue
-          id: find_issue
-          env:
-            GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-            GH_REPO: ${{ github.repository }}
-          run: |
-            ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
-            if [[ -z "$ISSUE_NUMBER" ]]; then
-              echo "No open build failure issue found. Nothing to do."
-              echo "issue_number=" >> $GITHUB_OUTPUT
-            else
-              echo "Found open build failure issue: #${ISSUE_NUMBER}"
-              echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
-            fi
-
-        - name: Add success label or close issue
-          if: steps.find_issue.outputs.issue_number != ''
-          env:
-            GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-            GH_REPO: ${{ github.repository }}
-          run: |
-            ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
-            LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')
-
-            if echo "$LABELS" | grep -q "success-run-2"; then
-              echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
-              gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
-              gh issue close $ISSUE_NUMBER
-              # Clean up all tracking labels
-              gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
-            elif echo "$LABELS" | grep -q "success-run-1"; then
-              echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
-              gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
-              gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
-              gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
-            else
-              echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
-              gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
-              gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
-            fi
+  # notify_success_and_close:
+  #     name: Close issue after 3 successful builds
+  #     # This job runs only if all the preceding test jobs succeeded
+  #     if: ${{ success() && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }}
+  #     needs: [cpu_unit_tests, gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests, tpu_pathways_unit_tests, tpu_pathways_integration_tests]
+  #     runs-on: ubuntu-latest
+  #     permissions:
+  #       issues: write
+  #     steps:
+  #       - name: Find existing failure issue
+  #         id: find_issue
+  #         env:
+  #           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #           GH_REPO: ${{ github.repository }}
+  #         run: |
+  #           ISSUE_NUMBER=$(gh issue list --label "failed-build" --state open --limit 1 --json number -q '.[0].number')
+  #           if [[ -z "$ISSUE_NUMBER" ]]; then
+  #             echo "No open build failure issue found. Nothing to do."
+  #             echo "issue_number=" >> $GITHUB_OUTPUT
+  #           else
+  #             echo "Found open build failure issue: #${ISSUE_NUMBER}"
+  #             echo "issue_number=${ISSUE_NUMBER}" >> $GITHUB_OUTPUT
+  #           fi
+
+  #       - name: Add success label or close issue
+  #         if: steps.find_issue.outputs.issue_number != ''
+  #         env:
+  #           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  #           GH_REPO: ${{ github.repository }}
+  #         run: |
+  #           ISSUE_NUMBER=${{ steps.find_issue.outputs.issue_number }}
+  #           LABELS=$(gh issue view $ISSUE_NUMBER --json labels -q '.labels[].name')
+
+  #           if echo "$LABELS" | grep -q "success-run-2"; then
+  #             echo "Third consecutive success. Closing issue #${ISSUE_NUMBER}."
+  #             gh issue comment $ISSUE_NUMBER --body "Build succeeded for the third consecutive time. Closing this issue automatically."
+  #             gh issue close $ISSUE_NUMBER
+  #             # Clean up all tracking labels
+  #             gh issue remove-label $ISSUE_NUMBER "failed-build" "success-run-2" --repo $GH_REPO
+  #           elif echo "$LABELS" | grep -q "success-run-1"; then
+  #             echo "Second consecutive success. Updating label on issue #${ISSUE_NUMBER}."
+  #             gh issue comment $ISSUE_NUMBER --body "Build succeeded for the second time. One more successful run will close this issue."
+  #             gh issue remove-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
+  #             gh issue add-label $ISSUE_NUMBER "success-run-2" --repo $GH_REPO
+  #           else
+  #             echo "First consecutive success since failure. Adding label to issue #${ISSUE_NUMBER}."
+  #             gh issue comment $ISSUE_NUMBER --body "Build succeeded. This issue will be auto-closed after two more consecutive successful runs."
+  #             gh issue add-label $ISSUE_NUMBER "success-run-1" --repo $GH_REPO
+  #           fi
@@ -64,6 +64,7 @@ jobs:
         IFRT_PROXY_USE_INSECURE_GRPC_CREDENTIALS: true
         JAX_PLATFORMS: "proxy"
         JAX_BACKEND_TARGET: "grpc://localhost:29000"
+        JAX_COORDINATOR_ADDRESS: "localhost"  # NOTE(review): bare host, no port - confirm the consumer accepts this
       options: ${{ inputs.container_resource_option }}
     steps:
       - uses: actions/checkout@v4

@@ -161,6 +161,7 @@ def maybe_initialize_jax_distributed_system(raw_keys):
 
   For CPUs, we call jax.distributed.initialize() explicitly, with the specified arguments.
   """
+  max_logging.log(f"LOG: maybe_initialize_jax_distributed_system - {raw_keys = }")  # NOTE(review): dumps all of raw_keys; confirm no secrets
   if raw_keys["skip_jax_distributed_system"]:
     max_logging.log("Skipping jax distributed system due to skip_jax_distributed_system=True flag.")
     return

@@ -85,6 +85,18 @@ def run_checkpointing(hardware, attention_type):
       "grain_worker_count=0",
       "grain_train_files=/tmp/gcsfuse/array-record/c4/en/3.0.1/c4-train.array_record*",
   ]
+
+  command = get_checkpointing_command(  # NOTE(review): within this hunk, only the print below reads `command`
+      run_date,
+      hardware=hardware,
+      steps=1,
+      metrics_file="saved_metrics.txt",
+      attention_type=attention_type,
+      dataset_type="grain",
+      dataset_path="/tmp/gcsfuse",
+  ) + grain_command
+  print(f"LOG: {command = }")  # debug output; the train_main call below builds its command separately
+
   train_main(
       get_checkpointing_command(
           run_date,