Add deleted test_te workflow

aybchan · aybchan · commit 0670a2930d82 · 2025-02-27T17:35:34.000Z
diff --git a/.github/workflows/_test_te.yml b/.github/workflows/_test_te.yml
@@ -0,0 +1,116 @@
+name: ~test TransformerEngine
+
+ on:
+   workflow_call:
+     inputs:
+       TE_IMAGE:
+         type: string
+         description: 'JAX+TE+PAXML image'
+         required: true
+         default: 'ghcr.io/nvidia/upstream-pax:latest'
+       ARTIFACT_PREFIX:
+         type: string
+         description: 'Name of the artifact zip file'
+         required: false
+         default: 'te'
+
+ jobs:
+   te-multi-gpu:
+     uses: ./.github/workflows/_test_slurm_pyxis.yaml
+     strategy:
+       matrix:
+         N_GPU: [2, 4, 8]
+       fail-fast: false
+     secrets:
+       SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
+       SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
+       CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
+     with:
+       NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
+       SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_SLURM_LOGIN }}
+       OUTPUT_BASEDIR: /nfs/cluster
+       OUTPUT_MOUNTPOINT: /output
+       NODES: 1
+       GPUS_PER_NODE: ${{ matrix.N_GPU }}
+       NTASKS: 1
+       NTASKS_PER_NODE: 1
+       TIME_LIMIT: '00:10:00'
+       EXTRA_EXPORTS: 'VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model'
+       IMAGE: ${{ inputs.TE_IMAGE }}
+       SRUN_PREAMBLE: |
+         nvidia-smi
+         pip install \
+           pytest \
+           pytest-reportlog \
+           cuda-python \
+           -r ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder/requirements.txt
+       SRUN_SCRIPT: |
+         set -ex
+         cd ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder
+         pytest --report-log=/output/pytest-report.jsonl \
+           test_single_gpu_encoder.py \
+           test_multigpu_encoder.py \
+           test_model_parallel_encoder.py
+
+   sitrep:
+     needs: [te-multi-gpu, te-unittests]
+     if: success() || failure()
+     runs-on: ubuntu-latest
+     env:
+       ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
+       BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
+     steps:
+       - name: Check out the repository under ${GITHUB_WORKSPACE}
+         uses: actions/checkout@v4
+
+       - name: Download artifacts
+         uses: actions/download-artifact@v4
+         with:
+           pattern: |
+             ${{ inputs.ARTIFACT_PREFIX }}-*
+           merge-multiple: true
+
+       - name: Generate sitrep
+         shell: bash -x -e {0}
+         run: |
+           # bring in utility functions
+           source .github/workflows/scripts/to_json.sh
+           test_outcome_files=$(find -name pytest-report.jsonl)
+           badge_label='TE Multi GPU tests'
+           passed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l)
+           failed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l)
+           total_tests=$((failed_tests + passed_tests))
+           
+           if [[ ${total_tests} == 0 ]]; then
+             badge_message='error'
+             badge_color=red
+             summary='TE multi GPU tests did not complete due to errors.'
+           else
+             badge_message="${passed_tests}/${total_tests} passed"
+             if [[ ${failed_tests} == 0 ]]; then
+               badge_color=brightgreen
+             else
+               badge_color=yellow
+             fi
+             summary="TE multi GPU tests : $badge_message"
+           fi
+           run_id=${{ github.run_id }} \
+           to_json \
+             run_id \
+             summary \
+             total_tests passed_tests failed_tests \
+             badge_label badge_color badge_message \
+           > sitrep.json
+           schemaVersion=1 \
+           label="${badge_label}" \
+           message="${badge_message}" \
+           color="${badge_color}" \
+           to_json schemaVersion label message color \
+           > ${{ env.BADGE_FILENAME_FULL }}
+       - name: Upload training logs as artifacts
+         uses: actions/upload-artifact@v4
+         with:
+           name: ${{ env.ARTIFACT_NAME_FULL }}
+           path: |
+             sitrep.json
+             ${{ env.BADGE_FILENAME_FULL }}