diff --git a/.github/container/Dockerfile.mjx b/.github/container/Dockerfile.mjx deleted file mode 100644 index 1bdd1a439..000000000 --- a/.github/container/Dockerfile.mjx +++ /dev/null @@ -1,54 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_MUJOCO=https://github.com/google-deepmind/mujoco.git#main -ARG URLREF_MUJOCO_MPC=https://github.com/google-deepmind/mujoco_mpc.git#main -ARG URLREF_L2R=https://github.com/google-deepmind/language_to_reward_2023.git#main -ARG SRC_PATH_MUJOCO=/opt/mujoco -ARG SRC_PATH_MUJOCO_MPC=/opt/mujoco-mpc -ARG SRC_PATH_L2R=/opt/language-to-reward-2023 - -############################################################################### -## Download source and add auxiliary scripts -############################################################################### - -FROM ${BASE_IMAGE} as mealkit -ARG URLREF_MUJOCO -ARG URLREF_MUJOCO_MPC -ARG URLREF_L2R -ARG SRC_PATH_MUJOCO -ARG SRC_PATH_MUJOCO_MPC -ARG SRC_PATH_L2R - -# Install system dependencies for Mujuco/MPC -RUN <<"EOF" bash -ex -apt-get update -apt-get install -y \ - libgl1-mesa-dev \ - libxinerama-dev \ - libxcursor-dev \ - libxrandr-dev \ - libxi-dev \ - ninja-build -apt-get clean -rm -rf /var/lib/apt/lists/* -EOF - -# Specify installation targets -RUN <<"EOF" bash -ex -git-clone.sh ${URLREF_MUJOCO} ${SRC_PATH_MUJOCO} -git-clone.sh ${URLREF_MUJOCO_MPC} ${SRC_PATH_MUJOCO_MPC} -git-clone.sh ${URLREF_L2R} ${SRC_PATH_L2R} -echo "-f https://py.mujoco.org/" >> /opt/pip-tools.d/requirements-mjx.in -echo "-e file://${SRC_PATH_MUJOCO}/mjx" >> /opt/pip-tools.d/requirements-mjx.in -echo "-e file://${SRC_PATH_MUJOCO_MPC}/python" >> /opt/pip-tools.d/requirements-l2r.in -echo "-e file://${SRC_PATH_L2R}" >> /opt/pip-tools.d/requirements-l2r.in -EOF - -############################################################################### -## Install accumulated packages from the base image and the previous stage 
-############################################################################### - -FROM mealkit as final - -RUN pip-finalize.sh diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index ab2a2f9d7..9746e771a 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -86,21 +86,6 @@ haliax: tracking_ref: main latest_verified_commit: 2a696a0c971901ff93afdaa965959d8e3b982ba9 mode: git-clone -mujoco: - url: https://github.com/google-deepmind/mujoco.git - tracking_ref: main - latest_verified_commit: e95159b4f6d48d114b16a8dc13ad26b3e44bc3e2 - mode: git-clone -mujoco-mpc: - url: https://github.com/google-deepmind/mujoco_mpc.git - tracking_ref: main - latest_verified_commit: 4700f4a13be18398f5aaf6a33ed42e531967e3ae - mode: git-clone -language-to-reward-2023: - url: https://github.com/google-deepmind/language_to_reward_2023.git - tracking_ref: main - latest_verified_commit: abb8e5125e4ecd0da378490b73448c05a694def5 - mode: git-clone mlperf-logging: url: https://github.com/mlcommons/logging.git tracking_ref: master diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 7d562a84b..c5bc64ac3 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -543,14 +543,6 @@ jobs: pytest-report-L0-unittest.jsonl pytest-report-L0-distributed-unittest.jsonl pytest-report-L1-distributed-unittest.jsonl - test-upstream-t5x: - needs: build-upstream-t5x - if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a - uses: ./.github/workflows/_test_upstream_t5x.yaml - with: - T5X_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - test-rosetta-t5x: needs: build-rosetta-t5x if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 @@ -569,7 +561,7 @@ jobs: docker run -i --gpus all --shm-size=1g \ ${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \ bash <<"EOF" |& tee test-levanter.log - pip install flake8 pytest soundfile librosa + pip install flake8 pytest pytest-asyncio 
soundfile tensorboardx librosa PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests -m "not entry and not slow and not ray" EOF STATISTICS_SCRIPT: | diff --git a/.github/workflows/_test_maxtext.yaml b/.github/workflows/_test_maxtext.yaml index f7a157878..5b1634f9a 100644 --- a/.github/workflows/_test_maxtext.yaml +++ b/.github/workflows/_test_maxtext.yaml @@ -373,24 +373,6 @@ jobs: ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }} FW_NAME: ${{ inputs.FW_NAME }} - summary: - name: test-maxtext-summary - runs-on: ubuntu-22.04 - needs: [single-process-multi-device, maxtext-multinode] - if: "!cancelled()" - steps: - - name: Generate TensorBoard query URL - run: | - ( - cat << EOF - - ## MaxText training - - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) - - EOF - ) | tee $GITHUB_STEP_SUMMARY - outcome: name: test-maxtext-outcome needs: sitrep diff --git a/.github/workflows/_test_t5x_rosetta.yaml b/.github/workflows/_test_t5x_rosetta.yaml index cc1a23179..df07e31e4 100644 --- a/.github/workflows/_test_t5x_rosetta.yaml +++ b/.github/workflows/_test_t5x_rosetta.yaml @@ -29,568 +29,15 @@ on: value: ${{ jobs.sitrep.outputs.STATUS }} env: - BATCH_SIZE_PER_GPU: 32 VIT_BATCH_SIZE_PER_GPU: 256 jobs: - - single-process-multi-device: - strategy: - max-parallel: 1 - matrix: - include: - # - TEST_NAME: "1P1G_te-0" - # N_GPU: 1 - # ADDITIONAL_ARGS: "--enable-te 0" - # EXTRA_GIN_ARGS: "" - - TEST_NAME: "1P8G_te-1" - N_GPU: 8 - ADDITIONAL_ARGS: "" - EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" - fail-fast: false - runs-on: jumpbox - env: - BADGE_FILENAME_PREFIX: badge-rosetta-t5x-single-process-multi-device - steps: - - name: Print environment variables - run: env - - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Setup SSH - id: setup-ssh - uses: 
./.github/actions/setup-ssh - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - - - name: Labels and metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_NAME }} - JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) - for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - cd $GITHUB_WORKSPACE - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=1 - #SBATCH --gpus-per-node=${{ matrix.N_GPU }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" - - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - true - - # run job with tasks on each node sharing one container - time srun \ - --ntasks=1 \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - bash -c 'wget -P /tmp/ https://raw.githubusercontent.com/NVIDIA/JAX-Toolbox/${{ github.sha }}/.github/container/test-t5x.sh && sleep 10 && bash /tmp/test-t5x.sh \ - --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ - --dtype bfloat16 \ - --batch-size 
${{ steps.meta.outputs.BATCH_SIZE }} \ - --epochs 7 \ - --steps-per-epoch 100 \ - --use-contrib-configs \ - ${{ matrix.ADDITIONAL_ARGS }} \ - ${{ matrix.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', matrix.EXTRA_GIN_ARGS) || '' }}' - EOF - ) - - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - . .github/workflows/scripts/wait_for_slurm_job.sh - - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - - set -x - - - name: Remove orphaned SLURM job if the CI job is canceled - if: cancelled() - shell: bash -x -e {0} - run: | - ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - - name: Retrieve training logs and upload to TensorBoard server - shell: bash -x -e {0} - run: | - cd $GITHUB_WORKSPACE - mkdir output/ - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ - output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ - output/ || true - rsync -rtz --progress \ - output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - - - name: Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME 
}}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF - - - name: Generate sitrep - if: success() || failure() - shell: bash -x -e {0} - run: | - # bring in utility functions - cd $GITHUB_WORKSPACE - source .github/workflows/scripts/to_json.sh - - EXIT_STATUSES="output/*-status.json" - badge_label='ROSETTA T5X SINGLE PROCESS MULTI DEVICE ${{ steps.meta.outputs.TEST_CASE_NAME }}' - passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - total_tests=$(ls $EXIT_STATUSES | wc -l) - - if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then - badge_message='error' - badge_color=red - summary="ROSETTA T5X SINGLE PROCESS MULTI DEVICE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message" - else - badge_message="${passed_tests}/${total_tests} passed" - if [[ ${failed_tests} == 0 ]]; then - badge_color=brightgreen - else - badge_color=yellow - fi - summary="ROSETTA T5X SINGLE PROCESS MULTI DEVICE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message" - fi - - to_json \ - summary \ - total_tests passed_tests failed_tests \ - badge_label badge_color badge_message \ - > output/sitrep.json - - schemaVersion=1 \ - label="${badge_label}" \ - message="${badge_message}" \ - color="${badge_color}" \ - to_json schemaVersion label message color \ - > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json - - - name: Upload training logs as artifacts - uses: actions/upload-artifact@v4 - with: - name: ${{ steps.meta.outputs.JOB_NAME }} - path: output/* - - # 2-node configs temporarily disabled - # multi-gpu-multi-node: - # strategy: - # max-parallel: 1 - # matrix: - # include: - # - TEST_NAME: "2N8G-te-1" - # N_GPU: 8 - # N_NODE: 2 - # 
ADDITIONAL_ARGS: "" - # EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" - # - TEST_NAME: "2N2G_te-0" - # N_GPU: 2 - # N_NODE: 2 - # ADDITIONAL_ARGS: "--enable-te 0" - # EXTRA_GIN_ARGS: "" - # fail-fast: false - # runs-on: jumpbox - # env: - # BADGE_FILENAME_PREFIX: badge-rosetta-t5x-multi-gpu-multi-node - # steps: - # - name: Print environment variables - # run: env - - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - - # - name: Setup SSH agent - # uses: webfactory/ssh-agent@v0.9.0 - # with: - # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - # - name: Setup SSH known hosts - # id: ssh-known-hosts - # run: | - # mkdir -p ~/.ssh - # cat >> ~/.ssh/known_hosts << EOF - # ${{ vars.SSH_KNOWN_HOSTS }} - # EOF - # chmod 600 ~/.ssh/known_hosts - # echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - # - name: Labels and metadata - # id: meta - # shell: bash -x -e {0} - # run: | - # IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" - # TEST_CASE_NAME=${{ matrix.TEST_NAME }} - # TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - # JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - # LOG_FILE=/nfs/cluster/${JOB_NAME}.log - # MODEL_PATH=/nfs/cluster/${JOB_NAME} - # BATCH_SIZE=$((${{ env.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - # for var in IMAGE TEST_CASE_NAME TOTAL_TASKS JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do - # echo "$var=${!var}" >> $GITHUB_OUTPUT - # done - - # - name: Submit SLURM jobs over SSH - # id: submit - # shell: bash -O expand_aliases -x -e {0} - # run: | - # cd $GITHUB_WORKSPACE - # alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - # sshx "date && hostname && sinfo" - # sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - # JOB=$(sshx sbatch --parsable << EOF - # #!/bin/bash - # #SBATCH --job-name=${{ 
steps.meta.outputs.JOB_NAME }} - # #SBATCH --exclusive - # #SBATCH --nodes=${{ matrix.N_NODE }} - # #SBATCH --gpus-per-node=${{ matrix.N_GPU }} - # #SBATCH --time=00:30:00 - # #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - # #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" - - # # preload enroot container using one task per node - # time srun \ - # --ntasks-per-node=1 \ - # --container-name=runtime \ - # --container-image=${{ steps.meta.outputs.IMAGE }} \ - # true - - # # run job with tasks on each node sharing one container - # time srun \ - # --ntasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - # --ntasks-per-node=${{ matrix.N_GPU }} \ - # --container-name=runtime \ - # --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - # --container-entrypoint \ - # bash -c 'wget -P /tmp/ https://raw.githubusercontent.com/NVIDIA/JAX-Toolbox/${{ github.sha }}/.github/container/test-t5x.sh && sleep 10 && bash /tmp/test-t5x.sh \ - # --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ - # --dtype bfloat16 \ - # --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ - # --epochs 7 \ - # --steps-per-epoch 100 \ - # --multiprocess \ - # --use-contrib-configs \ - # ${{ matrix.ADDITIONAL_ARGS }} \ - # ${{ matrix.EXTRA_GIN_ARGS != '' && format('--additional-args "{0}"', matrix.EXTRA_GIN_ARGS) || '' }}' - # EOF - # ) - - # echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - # . 
.github/workflows/scripts/wait_for_slurm_job.sh - - # wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # # Gather job info - # SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - # SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - # echo "SLURM Job state is ${SLURM_STATE}" - # echo "SLURM Job exit code is ${SLURM_EXITCODE}" - # echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - # echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - - # set -x - - # - name: Remove orphaned SLURM job if the CI job is canceled - # if: cancelled() - # shell: bash -x -e {0} - # run: | - # ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - # scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - # - name: Retrieve training logs and upload to TensorBoard server - # shell: bash -x -e {0} - # run: | - # cd $GITHUB_WORKSPACE - # mkdir output/ - # rsync -rtz --progress \ - # ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ - # output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true - # rsync -rtz --progress \ - # ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ - # output/ || true - # rsync -rtz --progress \ - # output/ \ - # ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - - # - name: Write SLURM job status to file - # shell: bash -x -e {0} - # run: | - # python << EOF - # import json - # with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - # dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - # json.dump(dump, f) - # EOF - - # - name: Generate sitrep - # if: success() || failure() - # 
shell: bash -x -e {0} - # run: | - # # bring in utility functions - # cd $GITHUB_WORKSPACE - # source .github/workflows/scripts/to_json.sh - - # EXIT_STATUSES="output/*-status.json" - # badge_label='ROSETTA T5X MULTI GPU MULTI NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}' - # passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - # failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - # total_tests=$(ls $EXIT_STATUSES | wc -l) - - # if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then - # badge_message='error' - # badge_color=red - # summary="ROSETTA T5X MULTI GPU MULTI NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message" - # else - # badge_message="${passed_tests}/${total_tests} passed" - # if [[ ${failed_tests} == 0 ]]; then - # badge_color=brightgreen - # else - # badge_color=yellow - # fi - # summary="ROSETTA T5X MULTI GPU MULTI NODE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message" - # fi - - # to_json \ - # summary \ - # total_tests passed_tests failed_tests \ - # badge_label badge_color badge_message \ - # > output/sitrep.json - - # schemaVersion=1 \ - # label="${badge_label}" \ - # message="${badge_message}" \ - # color="${badge_color}" \ - # to_json schemaVersion label message color \ - # > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json - - # - name: Upload training logs as artifacts - # uses: actions/upload-artifact@v4 - # with: - # name: ${{ steps.meta.outputs.JOB_NAME }} - # path: output/* - - # vit-single-process-multi-device: - # strategy: - # max-parallel: 1 - # matrix: - # N_GPU: [8] - # fail-fast: false - # runs-on: jumpbox - # env: - # BADGE_FILENAME_PREFIX: badge-rosetta-t5x-vit-single-process-multi-device - # steps: - # - name: Print environment variables - # run: env - - # - name: Setup SSH agent - # uses: webfactory/ssh-agent@v0.9.0 - # with: - # 
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - - # - name: Setup SSH known hosts - # id: ssh-known-hosts - # run: | - # mkdir -p ~/.ssh - # cat >> ~/.ssh/known_hosts << EOF - # ${{ vars.SSH_KNOWN_HOSTS }} - # EOF - # chmod 600 ~/.ssh/known_hosts - # echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - # - name: Labels and metadata - # id: meta - # shell: bash -x -e {0} - # run: | - # IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" - # TEST_CASE_NAME=VIT1P${{ matrix.N_GPU }}G - # JOB_NAME=${{ inputs.FW_NAME }}-vit-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - # LOG_FILE=/nfs/cluster/${JOB_NAME}.log - # MODEL_PATH=/nfs/cluster/${JOB_NAME} - # BATCH_SIZE=$((${{ env.VIT_BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) - # for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do - # echo "$var=${!var}" >> $GITHUB_OUTPUT - # done - - # - name: Submit SLURM jobs over SSH - # id: submit - # shell: bash -O expand_aliases -x -e {0} - # run: | - # cd $GITHUB_WORKSPACE - # alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - # sshx "date && hostname && sinfo" - # sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - # JOB=$(sshx sbatch --parsable << EOF - # #!/bin/bash - # #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - # #SBATCH --exclusive - # #SBATCH --nodes=1 - # #SBATCH --gpus-per-node=${{ matrix.N_GPU }} - # #SBATCH --time=00:30:00 - # #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - # #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" - - # # preload enroot container using one task per node - # time srun \ - # --ntasks-per-node=1 \ - # --container-name=runtime \ - # --container-image=${{ steps.meta.outputs.IMAGE }} \ - # true - - # # run job with tasks on each node sharing one container - # time srun \ - # --ntasks=1 \ - # --container-name=runtime \ - # 
--container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - # --container-entrypoint \ - # test-vit.sh \ - # --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ - # --dtype bfloat16 \ - # --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} - # EOF - # ) - - # . .github/workflows/scripts/wait_for_slurm_job.sh - - # wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # # Gather job info - # SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - # SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - # echo "SLURM Job state is ${SLURM_STATE}" - # echo "SLURM Job exit code is ${SLURM_EXITCODE}" - # echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - # echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - - # set -x - - # - name: Retrieve training logs and upload to TensorBoard server - # shell: bash -x -e {0} - # run: | - # cd $GITHUB_WORKSPACE - # mkdir output/ - # rsync -rtz --progress \ - # ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ - # output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true - # rsync -rtz --progress \ - # ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ - # output/ || true - # rsync -rtz --progress \ - # output/ \ - # ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-vit-${GITHUB_RUN_ID}/ || true - - # - name: Write SLURM job status to file - # shell: bash -x -e {0} - # run: | - # python << EOF - # import json - # with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - # dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - # json.dump(dump, f) - # EOF - - # - name: Generate sitrep - # if: 
success() || failure() - # shell: bash -x -e {0} - # run: | - # # bring in utility functions - # cd $GITHUB_WORKSPACE - # source .github/workflows/scripts/to_json.sh - - # EXIT_STATUSES="output/*-status.json" - # badge_label='ROSETTA T5X VIT SINGLE PROCESS MULTI DEVICE ${{ steps.meta.outputs.TEST_CASE_NAME }}' - # passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) - # failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) - # total_tests=$(ls $EXIT_STATUSES | wc -l) - - # if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then - # badge_message='error' - # badge_color=red - # summary="ROSETTA T5X VIT SINGLE PROCESS MULTI DEVICE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message" - # else - # badge_message="${passed_tests}/${total_tests} passed" - # if [[ ${failed_tests} == 0 ]]; then - # badge_color=brightgreen - # else - # badge_color=yellow - # fi - # summary="ROSETTA T5X VIT SINGLE PROCESS MULTI DEVICE ${{ steps.meta.outputs.TEST_CASE_NAME }}: $badge_message" - # fi - - # to_json \ - # summary \ - # total_tests passed_tests failed_tests \ - # badge_label badge_color badge_message \ - # > output/sitrep.json - - # schemaVersion=1 \ - # label="${badge_label}" \ - # message="${badge_message}" \ - # color="${badge_color}" \ - # to_json schemaVersion label message color \ - # > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json - - # - name: Upload training logs as artifacts - # uses: actions/upload-artifact@v4 - # with: - # name: ${{ steps.meta.outputs.JOB_NAME }} - # path: output/* - vit-multi-gpu-multi-node: strategy: max-parallel: 1 matrix: N_GPU: [8] - # 2-node configs temporarily disabled - N_NODE: [1] # , 2] + N_NODE: [1] fail-fast: false runs-on: jumpbox env: @@ -760,7 +207,7 @@ jobs: metrics: name: test-t5x-rosetta-metrics - needs: [single-process-multi-device, vit-multi-gpu-multi-node] # 
vit-single-process-multi-device, multi-gpu-multi-node + needs: [vit-multi-gpu-multi-node] runs-on: ubuntu-22.04 steps: @@ -774,7 +221,7 @@ jobs: shell: bash -eux {0} run: | pip install 'numpy<2.0.0' pytest pytest-reportlog tensorboard - for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-* ${{ inputs.FW_NAME }}-vit-${GITHUB_RUN_ID}-*; do + for i in ${{ inputs.FW_NAME }}-vit-${GITHUB_RUN_ID}-*; do JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}') METRIC_PATH=${JOB_NAME}_metrics.json python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --perf_summary_name "timing/steps_per_second" --output_json_path $METRIC_PATH @@ -806,7 +253,7 @@ jobs: summary: name: test-t5x-rosetta-summary runs-on: ubuntu-22.04 - needs: [single-process-multi-device, vit-multi-gpu-multi-node] # multi-gpu-multi-node, vit-single-process-multi-device + needs: [vit-multi-gpu-multi-node] if: "!cancelled()" steps: - name: Generate TensorBoard query URL diff --git a/.github/workflows/_test_upstream_t5x.yaml b/.github/workflows/_test_upstream_t5x.yaml deleted file mode 100644 index 892290327..000000000 --- a/.github/workflows/_test_upstream_t5x.yaml +++ /dev/null @@ -1,394 +0,0 @@ -name: ~test T5X, multi-node - -on: - workflow_call: - inputs: - T5X_IMAGE: - type: string - description: T5X image from ghcr.io/nvidia - default: 'ghcr.io/nvidia/upstream-t5x:latest' - required: false - BATCH_SIZE_PER_GPU: - type: number - description: Batch size per GPU - default: 32 - required: false - BADGE_FILENAME: - type: string - description: 'Name of the endpoint JSON file for shields.io badge' - required: false - default: 'badge-upstream-t5x-mgmn-test.json' - ARTIFACT_NAME: - type: string - description: 'Name of the artifact zip file' - required: false - default: 'artifact-upstream-t5x-mgmn-test' - FW_NAME: - type: string - description: 'Name of the framework being used' - required: false - default: 'upstream-t5x' - outputs: - TEST_STATUS: - description: 'Summary pass/fail value indicating if 
results from tests are acceptable' - value: ${{ jobs.sitrep.outputs.STATUS }} - -jobs: - - t5x-multi-gpu: - strategy: - max-parallel: 1 - matrix: - include: - - TEST_NAME: "1P8G" - N_GPU: 8 - # - TEST_NAME: "1P2G_fmha" - # N_GPU: 2 - # ADDITIONAL_ARGS: "--enable-fmha 1" - fail-fast: false - runs-on: jumpbox - steps: - - name: Print environment variables - run: env - - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Setup SSH - id: setup-ssh - uses: ./.github/actions/setup-ssh - with: - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - - - name: Labels and metadata - id: meta - shell: bash -x -e {0} - run: | - IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" - TEST_CASE_NAME=${{ matrix.TEST_NAME }} - JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME} - LOG_FILE=/nfs/cluster/${JOB_NAME}.log - MODEL_PATH=/nfs/cluster/${JOB_NAME} - BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }})) - for var in IMAGE TEST_CASE_NAME JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do - echo "$var=${!var}" >> $GITHUB_OUTPUT - done - - - name: Submit SLURM jobs over SSH - id: submit - shell: bash -O expand_aliases -x -e {0} - run: | - alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - sshx "date && hostname && sinfo" - sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - JOB=$(sshx sbatch --parsable << EOF - #!/bin/bash - #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - #SBATCH --exclusive - #SBATCH --nodes=1 - #SBATCH --gpus-per-node=${{ matrix.N_GPU }} - #SBATCH --time=00:30:00 - #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" - - # preload enroot container using one task per node - time srun \ - --ntasks-per-node=1 \ - --container-name=runtime \ - --container-image=${{ steps.meta.outputs.IMAGE }} \ - true - - # run job 
with tasks on each node sharing one container - time srun \ - --ntasks=1 \ - --container-name=runtime \ - --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - --container-entrypoint \ - test-t5x.sh \ - --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ - --dtype bfloat16 \ - --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ - --epochs 7 \ - --steps-per-epoch 100 \ - ${{ matrix.ADDITIONAL_ARGS }} - EOF - ) - - echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - . .github/workflows/scripts/wait_for_slurm_job.sh - - wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # Gather job info - SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - echo "SLURM Job state is ${SLURM_STATE}" - echo "SLURM Job exit code is ${SLURM_EXITCODE}" - echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - - set -x - - - name: Remove orphaned SLURM job if the CI job is canceled - if: cancelled() - shell: bash -x -e {0} - run: | - ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - - name: Retrieve training logs and upload to TensorBoard server - shell: bash -x -e {0} - run: | - mkdir output/ - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ - output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true - rsync -rtz --progress \ - ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ - output/ || true - rsync -rtz --progress \ - output/ \ - ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - - - name: 
Write SLURM job status to file - shell: bash -x -e {0} - run: | - python << EOF - import json - with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - json.dump(dump, f) - EOF - - - name: Upload training logs as artifacts - uses: actions/upload-artifact@v4 - with: - name: ${{ steps.meta.outputs.JOB_NAME }} - path: output/* - - # 2-node tests temporarily disabled - # t5x-multi-node: - # strategy: - # max-parallel: 1 - # matrix: - # include: - # - TEST_NAME: "8G2N" - # N_GPU: 8 - # N_NODE: 2 - # ADDITIONAL_ARGS: "" - # - TEST_NAME: "8G2N_fmha" - # N_GPU: 8 - # N_NODE: 2 - # ADDITIONAL_ARGS: "--enable-fmha 1" - # fail-fast: false - # runs-on: jumpbox - # steps: - # - name: Print environment variables - # run: env - - # - name: Check out the repository under ${GITHUB_WORKSPACE} - # uses: actions/checkout@v4 - - # - name: Setup SSH agent - # uses: webfactory/ssh-agent@v0.9.0 - # with: - # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - - # - name: Setup SSH known hosts - # id: ssh-known-hosts - # run: | - # mkdir -p ~/.ssh - # cat >> ~/.ssh/known_hosts << EOF - # ${{ vars.SSH_KNOWN_HOSTS }} - # EOF - # chmod 600 ~/.ssh/known_hosts - # echo "FILE=$(realpath ~/.ssh/known_hosts)" >> $GITHUB_OUTPUT - - # - name: Labels and metadata - # id: meta - # shell: bash -x -e {0} - # run: | - # IMAGE="$(echo ${{inputs.T5X_IMAGE}} | sed 's/\//#/')" - # TEST_CASE_NAME=${{ matrix.TEST_NAME }} - # TOTAL_TASKS=$((${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - # JOB_NAME=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-${TEST_CASE_NAME}; - # LOG_FILE=/nfs/cluster/${JOB_NAME}.log - # MODEL_PATH=/nfs/cluster/${JOB_NAME} - # BATCH_SIZE=$((${{ inputs.BATCH_SIZE_PER_GPU }} * ${{ matrix.N_GPU }} * ${{ matrix.N_NODE }})) - # for var in IMAGE TEST_CASE_NAME TOTAL_TASKS JOB_NAME LOG_FILE MODEL_PATH BATCH_SIZE; do - # echo "$var=${!var}" >> $GITHUB_OUTPUT - # 
done - - # - name: Submit SLURM jobs over SSH - # id: submit - # shell: bash -O expand_aliases -x -e {0} - # run: | - # alias sshx='ssh -o "ServerAliveInterval 7" ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}' - # sshx "date && hostname && sinfo" - # sshx mkdir -p ${{ steps.meta.outputs.MODEL_PATH }} - # JOB=$(sshx sbatch --parsable << EOF - # #!/bin/bash - # #SBATCH --job-name=${{ steps.meta.outputs.JOB_NAME }} - # #SBATCH --exclusive - # #SBATCH --nodes=${{ matrix.N_NODE }} - # #SBATCH --gpus-per-node=${{ matrix.N_GPU }} - # #SBATCH --time=00:30:00 - # #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }} - # #SBATCH --export="ENROOT_PASSWORD=${{ secrets.GITHUB_TOKEN }}" - - # # preload enroot container using one task per node - # time srun \ - # --ntasks-per-node=1 \ - # --container-name=runtime \ - # --container-image=${{ steps.meta.outputs.IMAGE }} \ - # true - - # # run job with tasks on each node sharing one container - # time srun \ - # --tasks=${{ steps.meta.outputs.TOTAL_TASKS }} \ - # --tasks-per-node=${{ matrix.N_GPU }} \ - # --container-name=runtime \ - # --container-mounts=${{ steps.meta.outputs.MODEL_PATH }}:/output \ - # --container-entrypoint \ - # test-t5x.sh \ - # --output /output/${{ steps.meta.outputs.TEST_CASE_NAME }} \ - # --dtype bfloat16 \ - # --batch-size ${{ steps.meta.outputs.BATCH_SIZE }} \ - # --epochs 7 \ - # --steps-per-epoch 100 \ - # --multiprocess \ - # ${{ matrix.ADDITIONAL_ARGS }} - # EOF - # ) - - # echo "SLURM_JOB_ID=${JOB}" >> $GITHUB_OUTPUT - - # . 
.github/workflows/scripts/wait_for_slurm_job.sh - - # wait_for_slurm_job ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} ${JOB} - - # # Gather job info - # SLURM_STATE=$(sshx sacct -j $JOB --format=State --parsable2 --noheader |& head -n 1) - # SLURM_EXITCODE=$(sshx sacct -j $JOB --format=exitcode --parsable2 --noheader | sort -r -u | head -1 | cut -f 1 -d":" | sed 's/ //g') - # echo "SLURM Job state is ${SLURM_STATE}" - # echo "SLURM Job exit code is ${SLURM_EXITCODE}" - # echo "SLURM_STATE=${SLURM_STATE}" >> "$GITHUB_OUTPUT" - # echo "SLURM_EXITCODE=${SLURM_EXITCODE}" >> "$GITHUB_OUTPUT" - - # set -x - - # - name: Remove orphaned SLURM job if the CI job is canceled - # if: cancelled() - # shell: bash -x -e {0} - # run: | - # ssh ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }} \ - # scancel ${{ steps.submit.outputs.SLURM_JOB_ID }} - - # - name: Retrieve training logs and upload to TensorBoard server - # shell: bash -x -e {0} - # run: | - - # mkdir output/ - # rsync -rtz --progress \ - # ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.LOG_FILE }} \ - # output/${{ steps.meta.outputs.TEST_CASE_NAME }}.log || true - # rsync -rtz --progress \ - # ${{ secrets.CLUSTER_LOGIN_USER }}@${{ vars.HOSTNAME_SLURM_LOGIN }}:${{ steps.meta.outputs.MODEL_PATH }}/* \ - # output/ || true - # rsync -rtz --progress \ - # output/ \ - # ${{ secrets.TENSORBOARD_UPLOAD_USER }}@${{ vars.HOSTNAME_TENSORBOARD }}:/tensorboard-logs/${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}/ || true - - # - name: Write SLURM job status to file - # shell: bash -x -e {0} - # run: | - # python << EOF - # import json - # with open("output/${{ steps.meta.outputs.TEST_CASE_NAME }}-status.json", "w") as f: - # dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} - # json.dump(dump, f) - # EOF - - # - name: Upload training logs as artifacts - # uses: actions/upload-artifact@v4 - # 
with: - # name: ${{ steps.meta.outputs.JOB_NAME }} - # path: output/* - - metrics: - name: test-upstream-t5x-metrics - needs: [t5x-multi-gpu] # t5x-multi-node - runs-on: ubuntu-22.04 - - steps: - - name: Check out the repository under ${GITHUB_WORKSPACE} - uses: actions/checkout@v4 - - - name: Download artifacts - uses: actions/download-artifact@v4 - - - name: Run pytest - shell: bash -eux {0} - run: | - pip install 'numpy<2.0.0' pytest pytest-reportlog tensorboard - for i in ${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}-*; do - JOB_NAME=$(echo $i | awk -F "${GITHUB_RUN_ID}-" '{print $2}') - METRIC_PATH=${JOB_NAME}_metrics.json - python3 .github/workflows/baselines/summarize_metrics.py $i/$JOB_NAME --perf_summary_name "timing/steps_per_second" --output_json_path $METRIC_PATH - # Test script expects the job dir and the log to be in the CWD - mv $i/$JOB_NAME $i/${JOB_NAME}.log . - done - - RESULTS_DIR=$PWD BASELINES_DIR=T5X_MGMN/upstream pytest --report-log=report.jsonl .github/workflows/baselines/test_t5x_mgmn_metrics.py || true - - - name: Upload metrics test json logs - uses: actions/upload-artifact@v4 - with: - name: ${{ inputs.FW_NAME }}-metrics-test-log - path: | - report.jsonl - *_metrics.json - - - sitrep: - name: test-upstream-t5x-sitrep - needs: metrics - if: "!cancelled()" - uses: ./.github/workflows/_sitrep_mgmn.yaml - secrets: inherit - with: - BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }} - ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }} - FW_NAME: ${{ inputs.FW_NAME }} - - summary: - name: test-upstream-t5x-summary - runs-on: ubuntu-22.04 - needs: [t5x-multi-gpu] # t5x-multi-node - if: "!cancelled()" - steps: - - name: Generate TensorBoard query URL - run: | - ( - cat << EOF - - ## T5X MGMN training - - [view metrics](https://${{ vars.HOSTNAME_TENSORBOARD }}/#scalars&regexInput=${{ inputs.FW_NAME }}-${GITHUB_RUN_ID}&_smoothingWeight=0&tagFilter=seqs_per) - - EOF - ) | tee $GITHUB_STEP_SUMMARY - - outcome: - name: test-upstream-t5x-outcome - needs: sitrep - runs-on: 
ubuntu-22.04 - if: "!cancelled()" - steps: - - name: Sets workflow status based on test outputs - run: | - if [[ ${{ needs.sitrep.outputs.STATUS }} != 'success' ]]; then - exit 1 - fi diff --git a/.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP1TP1PP.json b/.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP1TP1PP.json deleted file mode 100644 index 9898f40b6..000000000 --- a/.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP1TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.19963198900222778,0,0,0,0,0,0,0,0],"step_times":[0.340298334757487,0.13511633376280466,0.13490866621335348,0.13431999584039053,0.1360036681095759,0.13434800008932749,0.13548333446184793,0.13538900017738342,0.13546699782212576],"step_time_avg":0.15792603680381068,"e2e_time_seconds":59.38633333333333,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"} diff --git a/.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP8TP1PP.json b/.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP8TP1PP.json deleted file mode 100644 index 02664d450..000000000 --- a/.github/workflows/baselines/MAXTEXT/upstream/1DP1FSDP8TP1PP.json +++ /dev/null @@ -1 +0,0 @@ 
-{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.19989807903766632,0,0,0,0,0,0,0,0],"step_times":[0.22164533535639444,0.2257696638504664,0.20632266998291016,0.20477033158143362,0.2050279974937439,0.2048743317524592,0.205293337504069,0.2059936672449112,0.2050470014413198],"step_time_avg":0.20941603735641193,"e2e_time_seconds":173.058,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"} diff --git a/.github/workflows/baselines/MAXTEXT/upstream/1DP2FSDP4TP1PP_single_process.json b/.github/workflows/baselines/MAXTEXT/upstream/1DP2FSDP4TP1PP_single_process.json index 07c427871..490948d42 100644 --- a/.github/workflows/baselines/MAXTEXT/upstream/1DP2FSDP4TP1PP_single_process.json +++ b/.github/workflows/baselines/MAXTEXT/upstream/1DP2FSDP4TP1PP_single_process.json @@ -1 +1,33 @@ -{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.19917848706245422,0,0,0,0,0,0,0,0],"step_times":[0.27129199107487995,0.17545133332411447,0.1774536669254303,0.18130967020988464,0.17997999986012778,0.17623033126195273,0.17701533436775208,0.17688766618569693,0.1763359953959783],"step_time_avg":0.1879951098450908,"e2e_time_seconds":89.195,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"} +{ + "start_step": 1, + "end_step": 9, + "step_interval": 1, + "loss_values": [ + 24.950599670410156, + 23.6308536529541, + 22.22606086730957, + 20.85379981994629, + 19.61219596862793, + 18.5745849609375, + 17.77853012084961, + 17.22124481201172, + 16.864944458007812 + ], + "step_times": [ + 0.34216299653053284, + 
0.20161199569702148, + 0.20115399360656738, + 0.19551700353622437, + 0.19348999857902527, + 0.19705399870872498, + 0.19354699552059174, + 0.1978529989719391, + 0.19385899603366852 + ], + "step_time_avg": 0.2129165530204773, + "e2e_time_seconds": 34.555, + "run_urls": [ + "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/14054620516/artifacts" + ], + "date": "2025-03-25" +} diff --git a/.github/workflows/baselines/MAXTEXT/upstream/1DP4FSDP2TP1PP.json b/.github/workflows/baselines/MAXTEXT/upstream/1DP4FSDP2TP1PP.json deleted file mode 100644 index 941b3f2a9..000000000 --- a/.github/workflows/baselines/MAXTEXT/upstream/1DP4FSDP2TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.20010541379451752,3.576278402306343e-07,0,0,0,0,0,0,0],"step_times":[0.2389533370733261,0.16767800350983939,0.16975000500679016,0.16171966989835104,0.1687556654214859,0.1680160015821457,0.1645423322916031,0.16466433803240457,0.15995866556962332],"step_time_avg":0.17378200204284103,"e2e_time_seconds":180.69966666666667,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"} diff --git a/.github/workflows/baselines/MAXTEXT/upstream/1DP8FSDP1TP1PP.json b/.github/workflows/baselines/MAXTEXT/upstream/1DP8FSDP1TP1PP.json deleted file mode 100644 index e19974954..000000000 --- a/.github/workflows/baselines/MAXTEXT/upstream/1DP8FSDP1TP1PP.json +++ /dev/null @@ -1 +0,0 @@ 
-{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.19963198900222778,0,0,0,0,0,0,0,0],"step_times":[0.21862300237019858,0.15024366478125253,0.12854566673437753,0.12942766646544138,0.13015000025431314,0.13066466649373373,0.13325033088525137,0.12929299970467886,0.12958466509977976],"step_time_avg":0.1421980736432252,"e2e_time_seconds":166.42833333333334,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"} diff --git a/.github/workflows/baselines/MAXTEXT/upstream/2DP2FSDP2TP1PP.json b/.github/workflows/baselines/MAXTEXT/upstream/2DP2FSDP2TP1PP.json index 78648bc87..e6a84022c 100644 --- a/.github/workflows/baselines/MAXTEXT/upstream/2DP2FSDP2TP1PP.json +++ b/.github/workflows/baselines/MAXTEXT/upstream/2DP2FSDP2TP1PP.json @@ -1 +1,33 @@ -{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.20010541379451752,3.576278402306343e-07,0,0,0,0,0,0,0],"step_times":[0.2453316698471705,0.15532933175563812,0.15473033487796783,0.15553300082683563,0.15387233098347983,0.1558946669101715,0.15286600093046823,0.1544460008541743,0.15370899935563406],"step_time_avg":0.1646347040379489,"e2e_time_seconds":184.883,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"} +{ + "start_step": 1, + "end_step": 9, + "step_interval": 1, + "loss_values": [ + 24.9504337310791, + 23.63066291809082, + 22.226167678833008, + 20.853334426879883, + 19.611804962158203, + 18.574153900146484, + 17.778240203857422, + 17.22144317626953, + 16.864635467529297 + ], + "step_times": [ + 0.31094300746917725, + 0.21168699860572815, + 
0.16145099699497223, + 0.1525229960680008, + 0.153779998421669, + 0.15510499477386475, + 0.1530109941959381, + 0.1535159945487976, + 0.15335600078105927 + ], + "step_time_avg": 0.17837466465102303, + "e2e_time_seconds": 32.688, + "run_urls": [ + "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/14054620516/artifacts" + ], + "date": "2025-03-25" +} diff --git a/.github/workflows/baselines/MAXTEXT/upstream/4DP2FSDP2TP1PP.json b/.github/workflows/baselines/MAXTEXT/upstream/4DP2FSDP2TP1PP.json deleted file mode 100644 index 286efbae3..000000000 --- a/.github/workflows/baselines/MAXTEXT/upstream/4DP2FSDP2TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step":1,"end_step":9,"step_interval":1,"loss_values":[0.20010541379451752,3.576278402306343e-07,0,0,0,0,0,0,0],"step_times":[0.24482366939385733,0.2246866673231125,0.2056473344564438,0.21987300117810568,0.23971499999364218,0.21608999868234,0.21471566955248514,0.22115066647529602,0.22203166782855988],"step_time_avg":0.22319263054264918,"e2e_time_seconds":185.15866666666668,"run_urls":["https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729826963/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7729993999/artifacts","https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/7731914601/artifacts"],"date":"2024-01-31"} diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/16DP1FSDP1TP1PP_TE.json b/.github/workflows/baselines/PAX_MGMN/rosetta/16DP1FSDP1TP1PP_TE.json deleted file mode 100644 index c5329467e..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/16DP1FSDP1TP1PP_TE.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004531083977781236, 2.0986779418308288e-05, 1.31147601223347e-06], "step_times": [9.16663678487142, 9.166899998982748, 9.190437952677408], "step_time_avg": 9.174658245510523, "e2e_time_seconds": 286.86566666666664} \ No newline at end of file diff --git 
a/.github/workflows/baselines/PAX_MGMN/rosetta/1DP1FSDP1TP1PP_TE.json b/.github/workflows/baselines/PAX_MGMN/rosetta/1DP1FSDP1TP1PP_TE.json deleted file mode 100644 index 62ed20b41..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/1DP1FSDP1TP1PP_TE.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004550429875962436, 2.098800177918747e-05, 1.3114761259203078e-06], "step_times": [9.69591999053955, 9.694547653198242, 9.6983060836792], "step_time_avg": 9.696257909138998, "e2e_time_seconds": 193.862} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/1DP2FSDP4TP1PP_single_process_TE.json b/.github/workflows/baselines/PAX_MGMN/rosetta/1DP2FSDP4TP1PP_single_process_TE.json deleted file mode 100644 index a255dc62b..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/1DP2FSDP4TP1PP_single_process_TE.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004496203036978841, 2.1072781237307936e-05, 1.311417690885719e-06, 6.332992796842518e-08, 0.0], "step_times": [7.595651865005493, 7.599909543991089, 7.602108001708984, 7.595033645629883, 7.59737229347229], "step_time_avg": 7.598015069961548, "e2e_time_seconds": 234.1085} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/1DP8FSDP1TP1PP_TE.json b/.github/workflows/baselines/PAX_MGMN/rosetta/1DP8FSDP1TP1PP_TE.json deleted file mode 100644 index ac98f8443..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/1DP8FSDP1TP1PP_TE.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004562355752568692, 2.0987012248951942e-05, 1.31147601223347e-06], "step_times": [9.896685918172201, 9.902073860168457, 9.896770795186361], "step_time_avg": 9.898510191175673, "e2e_time_seconds": 282.402} \ No newline at end of file diff --git 
a/.github/workflows/baselines/PAX_MGMN/rosetta/2DP1FSDP1TP4PP.json b/.github/workflows/baselines/PAX_MGMN/rosetta/2DP1FSDP1TP4PP.json deleted file mode 100644 index a309a4a68..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/2DP1FSDP1TP4PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.000438039714936167, 2.219043562945444e-05, 1.4306265256891493e-06], "step_times": [2.470784823099772, 2.471130927403768, 2.471168835957845], "step_time_avg": 2.4710281954871287, "e2e_time_seconds": 316.19} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/2DP1FSDP2TP4PP.json b/.github/workflows/baselines/PAX_MGMN/rosetta/2DP1FSDP2TP4PP.json deleted file mode 100644 index f79ed51bb..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/2DP1FSDP2TP4PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004382363404147327, 2.253897037007846e-05, 1.4306265256891493e-06], "step_times": [1.6266847054163616, 1.6370126803716023, 1.6351629098256428], "step_time_avg": 1.6329534318712022, "e2e_time_seconds": 393.96866666666665} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/4DP1FSDP2TP1PP.json b/.github/workflows/baselines/PAX_MGMN/rosetta/4DP1FSDP2TP1PP.json deleted file mode 100644 index 368097a8c..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/4DP1FSDP2TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004683640436269343, 2.1106745407450944e-05, 1.311883579546702e-06], "step_times": [6.465008894602458, 6.465569972991943, 6.463742891947429], "step_time_avg": 6.464773919847276, "e2e_time_seconds": 285.65200000000004} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/4DP1FSDP2TP1PP_TE.json b/.github/workflows/baselines/PAX_MGMN/rosetta/4DP1FSDP2TP1PP_TE.json deleted 
file mode 100644 index d900af22e..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/4DP1FSDP2TP1PP_TE.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004528434365056455, 2.122193473041989e-05, 1.3954693258710904e-06], "step_times": [8.431368192036947, 8.43382708231608, 8.431408246358236], "step_time_avg": 8.432201173570421, "e2e_time_seconds": 287.23} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/8DP1FSDP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/rosetta/8DP1FSDP1TP1PP.json deleted file mode 100644 index 8ad323abf..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/8DP1FSDP1TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004681030404753983, 2.086862332362216e-05, 1.3121162965035182e-06], "step_times": [7.275770664215088, 7.267403920491536, 7.267686367034912], "step_time_avg": 7.270286983913845, "e2e_time_seconds": 273.26233333333334} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/8DP1FSDP1TP1PP_TE.json b/.github/workflows/baselines/PAX_MGMN/rosetta/8DP1FSDP1TP1PP_TE.json deleted file mode 100644 index 52a386ce0..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/8DP1FSDP1TP1PP_TE.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0004531083977781236, 2.0986779418308288e-05, 1.31147601223347e-06], "step_times": [9.474847793579102, 9.474331537882486, 9.4735320409139], "step_time_avg": 9.474237124125162, "e2e_time_seconds": 281.1556666666667} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/rosetta/8DP_TE_dropout.json b/.github/workflows/baselines/PAX_MGMN/rosetta/8DP_TE_dropout.json deleted file mode 100644 index 928612d1e..000000000 --- a/.github/workflows/baselines/PAX_MGMN/rosetta/8DP_TE_dropout.json +++ /dev/null @@ -1 +0,0 @@ 
-{"start_step": 100, "end_step": 300, "step_interval": 100, "loss_values": [0.0005329704144969583, 2.392743408563547e-05, 1.6679570080668782e-06], "step_times": [7.743874549865723, 7.7442946434021, 7.74159049987793], "step_time_avg": 7.743253231048584, "e2e_time_seconds": 307.527} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/16DP1FSDP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/upstream/16DP1FSDP1TP1PP.json deleted file mode 100644 index 17eaca746..000000000 --- a/.github/workflows/baselines/PAX_MGMN/upstream/16DP1FSDP1TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004681030404753983, 2.086862332362216e-05, 1.3121162965035182e-06, 5.8207657444020455e-11, 0.0], "step_times": [7.725894292195638, 7.695748964945476, 7.674357891082764, 7.7014509836832685, 7.720887184143066], "step_time_avg": 7.703667863210043, "e2e_time_seconds": 295.4576666666666} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/1DP1FSDP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/upstream/1DP1FSDP1TP1PP.json deleted file mode 100644 index 89aaeca73..000000000 --- a/.github/workflows/baselines/PAX_MGMN/upstream/1DP1FSDP1TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046809131163172424, 2.086867971229367e-05, 1.3123492408340098e-06, 5.8207657444020455e-11, 0.0], "step_times": [8.008778889973959, 8.014708836873373, 8.011429150899252, 8.013259251912435, 8.00814119974772], "step_time_avg": 8.011263465881347, "e2e_time_seconds": 204.86966666666663} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/1DP2FSDP4TP1PP_single_process.json b/.github/workflows/baselines/PAX_MGMN/upstream/1DP2FSDP4TP1PP_single_process.json deleted file mode 100644 index bd06ec03c..000000000 --- 
a/.github/workflows/baselines/PAX_MGMN/upstream/1DP2FSDP4TP1PP_single_process.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004679674166254699, 2.0868097635684535e-05, 1.3118251445121132e-06, 6.335539382007482e-08, 0.0], "step_times": [6.345967451731364, 6.3443193435668945, 6.345146497090657, 6.344050407409668, 6.3422525723775225], "step_time_avg": 6.344347254435221, "e2e_time_seconds": 238.7543333333333} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/1DP8FSDP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/upstream/1DP8FSDP1TP1PP.json deleted file mode 100644 index ab5c598ac..000000000 --- a/.github/workflows/baselines/PAX_MGMN/upstream/1DP8FSDP1TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.00046810254571028054, 2.0870078515144996e-05, 1.3122327118253452e-06, 5.8207657444020455e-11, 0.0], "step_times": [8.161738077799479, 8.162349383036295, 8.15965493520101, 8.158018112182617, 8.157390912373861], "step_time_avg": 8.159830284118652, "e2e_time_seconds": 296.75933333333336} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/2DP1FSDP1TP4PP.json b/.github/workflows/baselines/PAX_MGMN/upstream/2DP1FSDP1TP4PP.json deleted file mode 100644 index 97371666e..000000000 --- a/.github/workflows/baselines/PAX_MGMN/upstream/2DP1FSDP1TP4PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.000438039714936167, 2.219043562945444e-05, 1.4306265256891493e-06, 5.8207657444020455e-11, 0.0], "step_times": [2.5234107971191406, 2.5232578118642173, 2.5235915184020996, 2.5234344005584717, 2.5233071645100913], "step_time_avg": 2.523400338490804, "e2e_time_seconds": 388.4533333333334} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/2DP1FSDP2TP4PP.json 
b/.github/workflows/baselines/PAX_MGMN/upstream/2DP1FSDP2TP4PP.json deleted file mode 100644 index 85d1e5d4c..000000000 --- a/.github/workflows/baselines/PAX_MGMN/upstream/2DP1FSDP2TP4PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004382363404147327, 2.253897037007846e-05, 1.4306265256891493e-06, 5.8207657444020455e-11, 0.0], "step_times": [1.6573359568913777, 1.654531757036845, 1.6514259179433186, 1.652300516764323, 1.6523643334706624], "step_time_avg": 1.653591696421305, "e2e_time_seconds": 507.07099999999997} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/4DP1FSDP2TP1PP.json b/.github/workflows/baselines/PAX_MGMN/upstream/4DP1FSDP2TP1PP.json deleted file mode 100644 index 43e34bbce..000000000 --- a/.github/workflows/baselines/PAX_MGMN/upstream/4DP1FSDP2TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004683640436269343, 2.1106745407450944e-05, 1.311883579546702e-06, 5.8207657444020455e-11, 0.0], "step_times": [6.97843599319458, 6.977321783701579, 6.975888093312581, 6.976069609324138, 6.976081212361653], "step_time_avg": 6.976759338378906, "e2e_time_seconds": 306.4506666666667} \ No newline at end of file diff --git a/.github/workflows/baselines/PAX_MGMN/upstream/8DP1FSDP1TP1PP.json b/.github/workflows/baselines/PAX_MGMN/upstream/8DP1FSDP1TP1PP.json deleted file mode 100644 index 762a1bc9f..000000000 --- a/.github/workflows/baselines/PAX_MGMN/upstream/8DP1FSDP1TP1PP.json +++ /dev/null @@ -1 +0,0 @@ -{"start_step": 100, "end_step": 500, "step_interval": 100, "loss_values": [0.0004681030404753983, 2.086862332362216e-05, 1.3121162965035182e-06, 5.8207657444020455e-11, 0.0], "step_times": [7.965368588765462, 7.962141513824463, 7.961517333984375, 7.960983753204346, 7.957266171773274], "step_time_avg": 7.961455472310384, "e2e_time_seconds": 291.03866666666664} \ No newline at end of file diff 
--git a/.github/workflows/baselines/T5X_MGMN/rosetta/1N1G-te-1.json b/.github/workflows/baselines/T5X_MGMN/rosetta/1N1G-te-1.json deleted file mode 100644 index 4d4d02e03..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/1N1G-te-1.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 24153.919921875, - 18200.547526041668, - 16711.00390625, - 15931.573567708334, - 15485.792643229166, - 14638.229166666666, - 14497.9931640625 - ], - "step_times": [ - 16.47681490580241, - 17.68256441752116, - 17.76396878560384, - 17.746811548868816, - 17.440324783325195, - 17.567402521769207, - 17.809424082438152 - ], - "step_time_avg": 17.498187292189826, - "e2e_time_seconds": 50.457, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/1N8G-te-1.json b/.github/workflows/baselines/T5X_MGMN/rosetta/1N8G-te-1.json deleted file mode 100644 index 3b23678da..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/1N8G-te-1.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 184496.02083333334, - 132556.78125, - 118741.51041666667, - 112102, - 106474.15625, - 102533.91666666667, - 98940.96875 - ], - "step_times": [ - 7.723111152648926, - 10.411026954650879, - 10.323720932006836, - 10.33003012339274, - 10.171146392822266, - 10.210242907206217, - 10.345155080159506 - ], - "step_time_avg": 9.930633363269623, - "e2e_time_seconds": 51.413000000000004, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - 
"https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/1P1G_te-0.json b/.github/workflows/baselines/T5X_MGMN/rosetta/1P1G_te-0.json deleted file mode 100644 index 7b5a56b18..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/1P1G_te-0.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 24058.563802083332, - 18159.987630208332, - 16757.4765625, - 16054.7275390625, - 15671.132161458334, - 14891.440755208334, - 14785.358723958334 - ], - "step_times": [ - 2.516990343729655, - 20.410126368204754, - 20.358519236246746, - 20.480276107788086, - 20.413113276163738, - 20.514227549235027, - 20.529977798461914 - ], - "step_time_avg": 17.889032954261417, - "e2e_time_seconds": 95.94066666666667, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/1P1G_te-1.json b/.github/workflows/baselines/T5X_MGMN/rosetta/1P1G_te-1.json deleted file mode 100644 index 5288e8366..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/1P1G_te-1.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 24152.125, - 18200.479817708332, - 16711.490885416668, - 15932.138671875, - 15485.988932291666, - 14638.2822265625, - 14498.707356770834 - ], - "step_times": [ - 16.6195125579834, - 17.70514488220215, - 17.761261622111004, - 17.79443422953288, - 17.802642186482746, - 17.263482411702473, - 17.815998077392578 - ], - "step_time_avg": 
17.53749656677246, - "e2e_time_seconds": 71.54266666666668, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/1P8G_te-1.json b/.github/workflows/baselines/T5X_MGMN/rosetta/1P8G_te-1.json deleted file mode 100644 index 1f003af14..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/1P8G_te-1.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 184948.375, - 132454.71875, - 119301.75520833333, - 111850.2734375, - 106656.45052083333, - 102391.19791666667, - 98660.5546875 - ], - "step_times": [ - 7.569497108459473, - 8.135419845581055, - 8.108287652333578, - 8.119354883829752, - 8.113591512044271, - 8.524995803833008, - 9.187231063842773 - ], - "step_time_avg": 8.251196838560558, - "e2e_time_seconds": 84.15966666666667, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/2N2G_te-0.json b/.github/workflows/baselines/T5X_MGMN/rosetta/2N2G_te-0.json deleted file mode 100644 index bc1f34bae..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/2N2G_te-0.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 93512.08072916667, - 68011.20052083333, - 62676.079427083336, - 59229.05859375, - 56724.227864583336, - 55185.227864583336, - 54274.234375 - ], - "step_times": [ - 2.4313742319742837, - 
16.954547882080078, - 16.901138305664062, - 17.1687068939209, - 17.038618723551433, - 17.15284029642741, - 16.860179265340168 - ], - "step_time_avg": 14.929629371279761, - "e2e_time_seconds": 51.90366666666667, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/2N8G-te-1.json b/.github/workflows/baselines/T5X_MGMN/rosetta/2N8G-te-1.json deleted file mode 100644 index 38c336a24..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/2N8G-te-1.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 365773.1354166667, - 259264.546875, - 232126.52604166666, - 216850.84895833334, - 206546.31770833334, - 197197.40104166666, - 189904.16145833334 - ], - "step_times": [ - 1.3103941679000854, - 1.4087820053100586, - 1.4035348892211914, - 1.3380741675694783, - 1.4081446329752605, - 1.4064313570658367, - 1.4399555921554565 - ], - "step_time_avg": 1.3879024017424813, - "e2e_time_seconds": 52.197, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1G1N.json b/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1G1N.json deleted file mode 100644 index f9804cfd0..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1G1N.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "start_step": 100, - "end_step": 500, - "step_interval": 100, - "loss_values": [ - 6.5204572677612305, - 4.362146536509196, 
- 2.4585838317871094, - 2.3129119078318277, - 2.296177943547567 - ], - "step_times": [ - 4.789742787679036, - 4.7799530029296875, - 4.815515836079915, - 4.816166400909424, - 4.81501833597819 - ], - "step_time_avg": 4.80327927271525, - "e2e_time_seconds": 51.43133333333333, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1G2N.json b/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1G2N.json deleted file mode 100644 index 3d29e750c..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1G2N.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "start_step": 100, - "end_step": 500, - "step_interval": 100, - "loss_values": [ - 6.523322423299153, - 4.372343381245931, - 2.4592310587565103, - 2.312021334966024, - 2.29211433728536 - ], - "step_times": [ - 4.418381532033284, - 4.498948574066162, - 4.580581188201904, - 4.584135850270589, - 4.587292830149333 - ], - "step_time_avg": 4.533867994944255, - "e2e_time_seconds": 51.27533333333333, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1P8G.json b/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1P8G.json deleted file mode 100644 index 051843745..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT1P8G.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "start_step": 100, - "end_step": 500, - "step_interval": 100, - "loss_values": [ - 6.519473075866699, - 4.362549622853597, - 2.4568604628245034, - 
2.3104422092437744, - 2.2859185536702475 - ], - "step_times": [ - 1.2238010168075562, - 1.2661592562993367, - 1.2517070372899373, - 1.2438101768493652, - 1.2496572335561116 - ], - "step_time_avg": 1.2470269441604613, - "e2e_time_seconds": 51.833666666666666, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT8G2N.json b/.github/workflows/baselines/T5X_MGMN/rosetta/VIT8G2N.json deleted file mode 100644 index fd4900ff5..000000000 --- a/.github/workflows/baselines/T5X_MGMN/rosetta/VIT8G2N.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "start_step": 100, - "end_step": 500, - "step_interval": 100, - "loss_values": [ - 6.519150098164876, - 4.360336621602376, - 2.457017421722412, - 2.3106113274892173, - 2.2869962056477866 - ], - "step_times": [ - 3.559901714324951, - 4.227313041687012, - 4.242276032765706, - 4.264291922251384, - 4.272752126057942 - ], - "step_time_avg": 4.1133069674174, - "e2e_time_seconds": 75.54066666666667, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1G1N.json b/.github/workflows/baselines/T5X_MGMN/upstream/1G1N.json deleted file mode 100644 index 31c72c276..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/1G1N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 21865.04296875, - 16778.55078125, - 15394.390625, - 14834.7900390625, - 
14571.5107421875, - 13927.41015625, - 13905.013671875 - ], - "step_times": [ - 2.473292112350464, - 17.985205332438152, - 18.19552739461263, - 18.03304926554362, - 18.031944910685223, - 18.190677007039387, - 18.142311096191406 - ], - "step_time_avg": 15.864572445551554, - "e2e_time_seconds": 63.13766666666667, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1G2N.json b/.github/workflows/baselines/T5X_MGMN/upstream/1G2N.json deleted file mode 100644 index 335569788..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/1G2N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 42944.29296875, - 32115.787109375, - 29564.89453125, - 28500.9140625, - 27571.00390625, - 26453.740234375, - 26083.974609375 - ], - "step_times": [ - 2.3437534173329673, - 13.097546895345053, - 13.210731824239096, - 13.215600649515787, - 13.055537223815918, - 13.16451358795166, - 13.058845202128092 - ], - "step_time_avg": 11.592361257189795, - "e2e_time_seconds": 47.98266666666666, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1P1G.json b/.github/workflows/baselines/T5X_MGMN/upstream/1P1G.json deleted file mode 100644 index 6fe10cd06..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/1P1G.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - 
"step_interval": 100, - "loss_values": [ - 21865.04296875, - 16778.55078125, - 15394.390625, - 14834.7900390625, - 14571.5107421875, - 13927.41015625, - 13905.013671875 - ], - "step_times": [ - 2.47874116897583, - 17.988933563232422, - 18.21541976928711, - 17.84804407755534, - 18.093478520711262, - 18.190120061238606, - 18.059662501017254 - ], - "step_time_avg": 15.83919995171683, - "e2e_time_seconds": 82.35000000000001, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1P1G_fmha.json b/.github/workflows/baselines/T5X_MGMN/upstream/1P1G_fmha.json deleted file mode 100644 index 5255585b3..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/1P1G_fmha.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 21865.04296875, - 16778.55078125, - 15394.390625, - 14834.7900390625, - 14571.5107421875, - 13927.41015625, - 13905.013671875 - ], - "step_times": [ - 2.4637417793273926, - 18.070358276367188, - 18.185569127400715, - 18.15062967936198, - 18.173222223917644, - 18.198484420776367, - 18.11294428507487 - ], - "step_time_avg": 15.907849970318024, - "e2e_time_seconds": 47.749, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1P2G.json b/.github/workflows/baselines/T5X_MGMN/upstream/1P2G.json deleted file mode 100644 index 1ac8c54bf..000000000 --- 
a/.github/workflows/baselines/T5X_MGMN/upstream/1P2G.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 42809.5703125, - 31982.818359375, - 29629.173828125, - 28579.404296875, - 27443.494140625, - 26543.869140625, - 26117.07421875 - ], - "step_times": [ - 2.3347439765930176, - 17.100131352742512, - 17.403623580932617, - 17.190806070963543, - 17.344409306844074, - 17.314453125, - 17.230820337931316 - ], - "step_time_avg": 15.131283964429583, - "e2e_time_seconds": 67.003, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1P2G_fmha.json b/.github/workflows/baselines/T5X_MGMN/upstream/1P2G_fmha.json deleted file mode 100644 index 5234895ee..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/1P2G_fmha.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 42809.5703125, - 31982.818359375, - 29629.173828125, - 28579.404296875, - 27443.494140625, - 26543.869140625, - 26117.07421875 - ], - "step_times": [ - 2.3423378467559814, - 17.1409117380778, - 17.402149836222332, - 17.29240608215332, - 17.291365305582683, - 17.392317454020183, - 17.15788968404134 - ], - "step_time_avg": 15.145625420979092, - "e2e_time_seconds": 63.98333333333334, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1P4G.json 
b/.github/workflows/baselines/T5X_MGMN/upstream/1P4G.json deleted file mode 100644 index 1cf7e35a2..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/1P4G.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 84190.9453125, - 62587.01953125, - 57130.765625, - 54206.9921875, - 52445.33984375, - 50840.7734375, - 50487.34375 - ], - "step_times": [ - 2.079073746999105, - 16.785479227701824, - 17.152976353963215, - 17.007422765096027, - 17.030206044514973, - 17.1840763092041, - 17.014991124471027 - ], - "step_time_avg": 14.893460795992896, - "e2e_time_seconds": 60.213, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/1P8G.json b/.github/workflows/baselines/T5X_MGMN/upstream/1P8G.json deleted file mode 100644 index 9f974cac5..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/1P8G.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 166564.625, - 121425.8984375, - 110409.046875, - 104837.1875, - 101569.234375, - 100145.3046875, - 99009.4453125 - ], - "step_times": [ - 1.160822868347168, - 16.73763910929362, - 16.92938804626465, - 17.105731964111328, - 17.199840545654297, - 16.785912195841473, - 17.00780423482259 - ], - "step_time_avg": 14.703876994905016, - "e2e_time_seconds": 48.071666666666665, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} 
diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/2G1N.json b/.github/workflows/baselines/T5X_MGMN/upstream/2G1N.json deleted file mode 100644 index 90a1ddc3a..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/2G1N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 42944.29296875, - 32115.787109375, - 29564.89453125, - 28500.9140625, - 27571.00390625, - 26453.740234375, - 26083.974609375 - ], - "step_times": [ - 2.391838232676188, - 16.899702707926433, - 17.412066141764324, - 17.324010848999023, - 17.375378290812176, - 17.41155942281087, - 17.307125727335613 - ], - "step_time_avg": 15.160240196046376, - "e2e_time_seconds": 47.81833333333333, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/2G2N.json b/.github/workflows/baselines/T5X_MGMN/upstream/2G2N.json deleted file mode 100644 index c8942fc31..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/2G2N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 84881.8203125, - 62609.671875, - 57477.609375, - 54356.4453125, - 52232.98046875, - 51183.7890625, - 50615.93359375 - ], - "step_times": [ - 2.3189565340677896, - 14.426481564839682, - 15.366949081420898, - 14.962293942769369, - 14.883244832356771, - 15.368088722229004, - 14.990182876586914 - ], - "step_time_avg": 13.188028222038632, - "e2e_time_seconds": 47.80100000000001, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - 
"https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/2G2N_fmha.json b/.github/workflows/baselines/T5X_MGMN/upstream/2G2N_fmha.json deleted file mode 100644 index 0d67b192a..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/2G2N_fmha.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 84881.8203125, - 62609.671875, - 57477.609375, - 54356.4453125, - 52232.98046875, - 51183.7890625, - 50615.93359375 - ], - "step_times": [ - 2.3478333155314126, - 14.952726046244303, - 15.38317584991455, - 14.957984606424967, - 14.925978342692057, - 15.261824289957682, - 15.095227241516113 - ], - "step_time_avg": 13.274964241754441, - "e2e_time_seconds": 47.87633333333333, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/4G1N.json b/.github/workflows/baselines/T5X_MGMN/upstream/4G1N.json deleted file mode 100644 index 78a194565..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/4G1N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 85074.609375, - 62618.43359375, - 57475.515625, - 54443.35546875, - 52452.90234375, - 51453.50390625, - 50904.84375 - ], - "step_times": [ - 2.3716205755869546, - 16.59337552388509, - 17.15019162495931, - 16.815487543741863, - 16.90180206298828, - 16.74636459350586, - 16.827538172403973 - ], - "step_time_avg": 14.772340013867334, - "e2e_time_seconds": 47.494, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - 
"https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/4G2N.json b/.github/workflows/baselines/T5X_MGMN/upstream/4G2N.json deleted file mode 100644 index 491b6e0e9..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/4G2N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 166386.734375, - 121687.125, - 110239.375, - 105225.1171875, - 101571.890625, - 100226.734375, - 99198.7890625 - ], - "step_times": [ - 2.3323241074879966, - 15.210881868998209, - 16.050008455912273, - 15.354175249735514, - 15.846360206604004, - 15.817280133565268, - 15.525897343953451 - ], - "step_time_avg": 13.733846766608103, - "e2e_time_seconds": 48.040666666666674, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/8G1N.json b/.github/workflows/baselines/T5X_MGMN/upstream/8G1N.json deleted file mode 100644 index 79995a1dc..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/8G1N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 166136.0625, - 121262.1171875, - 109982.3359375, - 105095.59375, - 101807.7890625, - 100457.359375, - 100047.9453125 - ], - "step_times": [ - 2.2884463469187417, - 16.788105010986328, - 17.029067357381184, - 17.20726267496745, - 17.159779230753582, - 17.193702697753906, - 17.069965362548828 - ], - "step_time_avg": 14.962332668758576, - "e2e_time_seconds": 48.57266666666667, - "run_urls": [ - 
"https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/8G2N.json b/.github/workflows/baselines/T5X_MGMN/upstream/8G2N.json deleted file mode 100644 index d3f19ab9e..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/8G2N.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 329745.75, - 238854.71875, - 216490, - 206838.921875, - 202744.78125, - 200420.703125, - 199783.03125 - ], - "step_times": [ - 2.2293623288472495, - 15.984151522318522, - 16.54537582397461, - 16.404622395833332, - 16.47221310933431, - 16.532596588134766, - 16.46229362487793 - ], - "step_time_avg": 14.375802199045816, - "e2e_time_seconds": 50.278999999999996, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/T5X_MGMN/upstream/8G2N_fmha.json b/.github/workflows/baselines/T5X_MGMN/upstream/8G2N_fmha.json deleted file mode 100644 index 6a1785a0a..000000000 --- a/.github/workflows/baselines/T5X_MGMN/upstream/8G2N_fmha.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "start_step": 100, - "end_step": 700, - "step_interval": 100, - "loss_values": [ - 329745.75, - 238854.71875, - 216490, - 206838.921875, - 202744.78125, - 200420.703125, - 199783.03125 - ], - "step_times": [ - 2.2479323546091714, - 15.95144526163737, - 16.547892252604168, - 16.44203758239746, - 16.48525047302246, - 16.44901402791341, - 16.404547373453777 - ], - "step_time_avg": 
14.361159903662546, - "e2e_time_seconds": 48.32, - "run_urls": [ - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8324237226/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8314453358/artifacts", - "https://api.github.com/repos/NVIDIA/JAX-Toolbox/actions/runs/8306837544/artifacts" - ], - "date": "2024-03-19" -} diff --git a/.github/workflows/baselines/create_baselines.sh b/.github/workflows/baselines/create_baselines.sh index 10d1d9cb6..086d7efa6 100755 --- a/.github/workflows/baselines/create_baselines.sh +++ b/.github/workflows/baselines/create_baselines.sh @@ -46,7 +46,10 @@ elif [[ "$TYPE" == "rosetta-t5x" ]]; then ) OUTPUT_DIR=T5X_MGMN/rosetta elif [[ "$TYPE" == "upstream-maxtext" ]]; then - CONFIGS=("1DP1FSDP1TP1PP" "1DP1FSDP8TP1PP" "1DP2FSDP4TP1PP_single_process" "1DP4FSDP2TP1PP" "1DP8FSDP1TP1PP" "2DP2FSDP2TP1PP" "4DP2FSDP2TP1PP") + CONFIGS=( + "1DP2FSDP4TP1PP_single_process" + "2DP2FSDP2TP1PP" + ) OUTPUT_DIR=MAXTEXT/upstream else usage @@ -60,8 +63,10 @@ bash ${UTIL_DIR}/download_artifacts.sh ${ALL_WF_RUNS[@]} URLS=() for WORKFLOW_RUN in ${ALL_WF_RUNS[@]}; do for CFG in ${CONFIGS[@]}; do - if [[ $(find . -mindepth 1 -maxdepth 2 -type d -name $CFG | wc -l) -ne 1 ]]; then - echo "Expected one artifact to have a '$CFG' dir under '$PWD', but found $(find . -mindepth 1 -maxdepth 2 -type d -name $CFG)" + CFG=$TYPE-$WORKFLOW_RUN-$CFG + ARTS=$(find . 
-mindepth 1 -maxdepth 2 -type d -name $CFG) + if (( $(echo ${ARTS} | wc -l) != 1 )); then + echo "Expected one artifact to have a '$CFG' dir under '$PWD', but found ${ARTS}" exit 1 fi done diff --git a/.github/workflows/baselines/summarize_metrics.py b/.github/workflows/baselines/summarize_metrics.py index 1e1e13009..5fc52e1a0 100644 --- a/.github/workflows/baselines/summarize_metrics.py +++ b/.github/workflows/baselines/summarize_metrics.py @@ -40,7 +40,7 @@ def main(): if not os.path.exists(searchpath): searchpath = os.path.join(args.test_config, "summaries/train") if not os.path.exists(searchpath): - searchpath = os.path.join(args.test_config, "logdir/tensorboard") + searchpath = os.path.join(args.test_config, "logdir/tensorboard/logdir") assert os.path.exists(searchpath), f"Neither {args.test_config}/train nor {args.test_config}/summaries/train nor {args.test_config}/logdir/tensorboard dirs exist" event_files = glob.glob(os.path.join(searchpath, "events*")) assert len(event_files) > 0, f"{searchpath} did not contain a tensorboard events file" diff --git a/.github/workflows/baselines/test_maxtext_metrics.py b/.github/workflows/baselines/test_maxtext_metrics.py index a130c86c6..6626f0a25 100644 --- a/.github/workflows/baselines/test_maxtext_metrics.py +++ b/.github/workflows/baselines/test_maxtext_metrics.py @@ -2,10 +2,11 @@ import os import json import glob -import sys +from numpy.testing import assert_allclose import test_utils from statistics import mean +LOSS_RTOL = 0.10 STEP_TIME_MULT = 0.95 E2E_TIME_MULT = 0.95 test_dir = os.path.dirname(os.path.abspath(__file__)) @@ -22,9 +23,22 @@ def test_loss(baseline_filename): event_file = os.path.join(results_dir, test_config, "logdir/tensorboard/logdir/events*") event_file = glob.glob(event_file)[0] with open(baseline_filepath, "r") as baseline_file: - end_step = json.load(baseline_file)["end_step"] + baseline_data = json.load(baseline_file) + loss_expected_values = baseline_data["loss_values"] + start_step = 
baseline_data["start_step"] + end_step = baseline_data["end_step"] + interval = baseline_data["step_interval"] + loss_expected = {step: loss_expected_values[i] for i, step in enumerate( + range(start_step, end_step+1, interval))} loss_actual = test_utils.read_maxtext_tb_tag(event_file, loss_summary_name) - assert 0 <= loss_actual[end_step] < 1.8e-3, f"Loss at final step: {loss_actual[end_step]}, Expected 0 <= loss < 1.8e-3" + del loss_actual[0] # removing the very first step + assert loss_expected.keys() == loss_actual.keys(), \ + f"Steps at which loss was emitted for run do not match baseline. \ + Actual steps: {loss_actual.keys()}, Baseline steps: {loss_expected.keys()}" + assert_allclose(list(loss_actual.values()), list(loss_expected.values()), + rtol=LOSS_RTOL, + err_msg=f"Run loss values: {loss_actual.values()}, \ + Baseline loss values: {loss_expected.values()}") @pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) diff --git a/.github/workflows/baselines/test_pax_mgmn_metrics.py b/.github/workflows/baselines/test_pax_mgmn_metrics.py deleted file mode 100644 index e52b8d686..000000000 --- a/.github/workflows/baselines/test_pax_mgmn_metrics.py +++ /dev/null @@ -1,56 +0,0 @@ -import pytest -import os -import json -import glob -import sys -import test_utils -from statistics import mean - -STEP_TIME_MULT = 0.95 -E2E_TIME_MULT = 0.95 -test_dir = os.path.dirname(os.path.abspath(__file__)) -baselines_dir = os.path.join(test_dir, os.environ.get("BASELINES_DIR")) -results_dir = os.environ.get("RESULTS_DIR") -loss_summary_name = "loss" -step_time_summary_name = "Steps/sec" - - -@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) -def test_loss(baseline_filename): - baseline_filepath = os.path.join(baselines_dir, baseline_filename) - test_config = baseline_filename.split(".")[0] - event_file = os.path.join(results_dir, test_config, "summaries/train/events*") - event_file = glob.glob(event_file)[0] - with open(baseline_filepath, 
"r") as baseline_file: - end_step = json.load(baseline_file)["end_step"] - loss_actual = test_utils.read_tb_tag(event_file, loss_summary_name) - assert 0 <= loss_actual[end_step] < 1.8e-6, f"Loss at final step: {loss_actual[end_step]}, Expected 0 <= loss < 1.8e-6" - - -@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) -def test_step_time(baseline_filename): - baseline_filepath = os.path.join(baselines_dir, baseline_filename) - test_config = baseline_filename.split(".")[0] - event_file = os.path.join(results_dir, test_config, "summaries/train/events*") - event_file = glob.glob(event_file)[0] - with open(baseline_filepath, "r") as baseline_file: - step_time_avg_expected = json.load(baseline_file)["step_time_avg"] - step_time_dict = test_utils.read_tb_tag(event_file, step_time_summary_name) - step_time_values = [step_time_dict[i] for i in sorted(step_time_dict.keys())] - ## exclude the first steps/sec value from the average - ## because it includes compilation time - step_time_avg_actual = mean(step_time_values[1:]) - assert step_time_avg_actual > step_time_avg_expected * \ - STEP_TIME_MULT, f"Step time values: {step_time_values} (Avg: {step_time_avg_actual}), Expected avg: {step_time_avg_expected}" - - -@pytest.mark.parametrize("baseline_filename", os.listdir(baselines_dir)) -def test_e2e_time(baseline_filename): - baseline_filepath = os.path.join(baselines_dir, baseline_filename) - test_config = baseline_filename.split(".")[0] - run_log = os.path.join(results_dir, test_config + ".log") - with open(baseline_filepath, "r") as baseline_file: - e2e_time_expected = json.load(baseline_file)["e2e_time_seconds"] - e2e_time_actual = test_utils.read_e2e_time(run_log) - assert e2e_time_actual < e2e_time_expected / \ - E2E_TIME_MULT, f"Run E2E time: {e2e_time_actual}, Expected E2E time: {e2e_time_expected}" diff --git a/.github/workflows/mjx-build-test.yaml b/.github/workflows/mjx-build-test.yaml deleted file mode 100644 index d91bf8850..000000000 --- 
a/.github/workflows/mjx-build-test.yaml +++ /dev/null @@ -1,204 +0,0 @@ -name: MJX build -run-name: MJX build (${{ github.event_name == 'workflow_run' && format('nightly {0}', github.event.workflow_run.created_at) || github.event_name }}) - -on: - schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - workflow_dispatch: - inputs: - BASE_IMAGE_AMD64: - type: string - description: 'JAX mealkit AMD64 image built by NVIDIA/JAX-Toolbox' - default: '' - required: false - BASE_IMAGE_ARM64: - type: string - description: 'JAX mealkit AMD64 image built by NVIDIA/JAX-Toolbox' - default: '' - required: false - PUBLISH: - type: boolean - description: Publish dated images and update the 'latest' tag? - default: false - required: false - - -env: - DOCKER_REGISTRY: ghcr.io/nvidia - DEFAULT_BASE_IMAGE: ghcr.io/nvidia/jax-mealkit:jax - - -permissions: - contents: read # to fetch code - actions: write # to cancel previous workflows - packages: write # to upload container - -jobs: - - metadata: - runs-on: ubuntu-22.04 - outputs: - PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} - BASE_IMAGE_AMD64: ${{ steps.base-image.outputs.BASE_IMAGE_AMD64 }} - BASE_IMAGE_ARM64: ${{ steps.base-image.outputs.BASE_IMAGE_ARM64 }} - BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} - - steps: - - - name: Cancel workflow if upstream workflow did not success - if: ${{ steps.if-upstream-failed.outputs.UPSTREAM_FAILED == 'true' }} - run: | - echo "Upstream workflow failed, cancelling this workflow" - curl -X POST -H "Authorization: token ${{ github.token }}" \ - -H "Accept: application/vnd.github.v3+json" \ - "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel" - cat # blocks execution in case workflow cancellation takes time - - - name: Determine if the resulting container should be 'published' - id: if-publish - shell: bash -x -e {0} - run: - # A container should be published if: - # 1) the workflow is triggered by workflow_dispatch and the PUBLISH 
input is true, or - # 2) the workflow is triggered by workflow_run (i.e., a nightly build) - echo "PUBLISH=${{ github.event_name == 'workflow_run' || (github.event_name == 'workflow_dispatch' && inputs.PUBLISH) }}" >> $GITHUB_OUTPUT - - - name: Set build date - id: date - shell: bash -x -e {0} - run: | - BUILD_DATE=$(TZ='US/Los_Angeles' date '+%Y-%m-%d') - echo "BUILD_DATE=${BUILD_DATE}" >> $GITHUB_OUTPUT - - - name: Set base image - id: base-image - shell: bash -x -e {0} - run: | - if [[ -z "${{ inputs.BASE_IMAGE }}" ]]; then - BASE_IMAGE_AMD64=${{ env.DEFAULT_BASE_IMAGE }} - BASE_IMAGE_ARM64=${{ env.DEFAULT_BASE_IMAGE }} - else - BASE_IMAGE_AMD64=${{ inputs.BASE_IMAGE_AMD64 }} - BASE_IMAGE_ARM64=${{ inputs.BASE_IMAGE_ARM64 }} - fi - echo "BASE_IMAGE_AMD64=${BASE_IMAGE_AMD64}" >> $GITHUB_OUTPUT - echo "BASE_IMAGE_ARM64=${BASE_IMAGE_ARM64}" >> $GITHUB_OUTPUT - - amd64: - needs: metadata - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: amd64 - ARTIFACT_NAME: artifact-mjx-build - BADGE_FILENAME: badge-mjx-build - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_AMD64 }} - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - CONTAINER_NAME: mjx - DOCKERFILE: .github/container/Dockerfile.mjx - secrets: inherit - - arm64: - needs: metadata - uses: ./.github/workflows/_build.yaml - with: - ARCHITECTURE: arm64 - ARTIFACT_NAME: artifact-mjx-build - BADGE_FILENAME: badge-mjx-build - BASE_IMAGE: ${{ needs.metadata.outputs.BASE_IMAGE_ARM64 }} - BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} - CONTAINER_NAME: mjx - DOCKERFILE: .github/container/Dockerfile.mjx - secrets: inherit - - publish-mealkit: - needs: [metadata, amd64, arm64] - if: false - #if: needs.metadata.outputs.PUBLISH == 'true' - uses: ./.github/workflows/_publish_container.yaml - with: - ARTIFACT_NAME: mealkit-mjx - ARTIFACT_TAG: mjx-mealkit-${{ needs.metadata.outputs.BUILD_DATE }} - SOURCE_IMAGE: | - ${{ needs.amd64.outputs.DOCKER_TAG_MEALKIT }} - ${{ 
needs.arm64.outputs.DOCKER_TAG_MEALKIT }} - TARGET_IMAGE: jax - TARGET_TAGS: | - type=raw,value=mjx-mealkit,priority=500 - type=raw,value=mjx-mealkit-${{ needs.metadata.outputs.BUILD_DATE }},priority=500 - - publish-final: - needs: [metadata, amd64, arm64] - if: false - #if: needs.metadata.outputs.PUBLISH == 'true' - uses: ./.github/workflows/_publish_container.yaml - with: - ARTIFACT_NAME: final-mjx - ARTIFACT_TAG: mjx-nightly-${{ needs.metadata.outputs.BUILD_DATE }} - SOURCE_IMAGE: | - ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} - ${{ needs.arm64.outputs.DOCKER_TAG_FINAL }} - TARGET_IMAGE: jax - TARGET_TAGS: | - type=raw,value=mjx-latest,priority=1000 - type=raw,value=mjx-nightly-${{ needs.metadata.outputs.BUILD_DATE }},priority=900 - - # disabled because the build is failing and this workflow needs reworking not to block the slurm cluster - # small perf tests - # runner: - # uses: ./.github/workflows/_runner_ondemand_slurm.yaml - # with: - # NAME: "A100-${{ github.run_id }}" - # LABELS: "A100:${{ github.run_id }}" - # TIME: "01:00:00" - # secrets: inherit - - # mjx-unit-test: - # needs: amd64 - # strategy: - # fail-fast: false - # matrix: - # GPU_ARCH: [A100] - # # ensures A100 job lands on dedicated runner for this particular job - # runs-on: [self-hosted, "${{ matrix.GPU_ARCH == 'A100' && format('{0}:{1}', matrix.GPU_ARCH, github.run_id) || matrix.GPU_ARCH }}"] - # steps: - # - name: Print environment variables - # run: env - - # - name: Print GPU information - # run: nvidia-smi - - # - name: Check out repository - # uses: actions/checkout@v4 - - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v3 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - - # - name: Pull MJX image - # shell: bash -x -e {0} - # run: | - # docker pull ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} - - # - name: MJX speed test - # shell: bash -x -e {0} - # continue-on-error: true - # run: | - # 
docker run --gpus=all --shm-size=1g ${{ needs.amd64.outputs.DOCKER_TAG_FINAL }} bash -ec "mjx-testspeed --mjcf=humanoid/humanoid.xml --batch_size=8192 --unroll=4 --output=tsv" | tee -a test-mjx.log - - # - name: Save perf to summary - # shell: bash -x -e {0} - # continue-on-error: true - # run: | - # SUMMARY_PATTERN="^mjx-testspeed" - # SUMMARY=$(cat test-mjx.log | grep "$SUMMARY_PATTERN") - # echo "${SUMMARY}" | tee -a $GITHUB_STEP_SUMMARY - - # - name: Upload artifacts - # uses: actions/upload-artifact@v4 - # with: - # name: ${{ env.DEFAULT_ARTIFACT_NAME }}-${{ matrix.GPU_ARCH }} - # path: | - # test-mjx.log diff --git a/README.md b/README.md index d50469c9b..6e6570b9d 100644 --- a/README.md +++ b/README.md @@ -154,9 +154,7 @@ We support and test the following JAX frameworks and model architectures. More d