Skip to content

Commit 0670a29

Browse files
committed
Add deleted test_te workflow
1 parent a6071e1 commit 0670a29

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed

.github/workflows/_test_te.yml

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
# Reusable workflow (invoked through `workflow_call`) that runs the
# TransformerEngine JAX encoder example tests inside a caller-supplied
# container image and uploads a sitrep/badge artifact.
name: ~test TransformerEngine

on:
  workflow_call:
    inputs:
      # Container image the tests run in.
      TE_IMAGE:
        description: "JAX+TE+PAXML image"
        type: string
        required: true
        # NOTE(review): with `required: true` the caller must always pass
        # TE_IMAGE, so this default looks unreachable -- confirm and keep
        # only one of the two.
        default: "ghcr.io/nvidia/upstream-pax:latest"
      # Prefix shared by the per-GPU-count job names and the artifact /
      # badge file names produced by this workflow.
      ARTIFACT_PREFIX:
        description: "Name of the artifact zip file"
        type: string
        required: false
        default: "te"

jobs:
  # Runs the TransformerEngine JAX encoder example tests through the shared
  # Slurm/pyxis reusable workflow, once per GPU count in the matrix.
  te-multi-gpu:
    uses: ./.github/workflows/_test_slurm_pyxis.yaml
    strategy:
      matrix:
        N_GPU: [2, 4, 8]
      # Let the remaining GPU counts finish even if one of them fails.
      fail-fast: false
    secrets:
      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
      SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
      CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
    with:
      # One name per matrix entry, e.g. "<prefix>-2GPU"; the sitrep job
      # below downloads artifacts matching "<prefix>-*".
      NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
      SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_SLURM_LOGIN }}
      OUTPUT_BASEDIR: /nfs/cluster
      # The test script writes its pytest report under this path.
      OUTPUT_MOUNTPOINT: /output
      NODES: 1
      GPUS_PER_NODE: ${{ matrix.N_GPU }}
      NTASKS: 1
      NTASKS_PER_NODE: 1
      TIME_LIMIT: '00:10:00'
      EXTRA_EXPORTS: 'VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model'
      IMAGE: ${{ inputs.TE_IMAGE }}
      # Print the visible GPUs, then install the test tooling plus the
      # encoder example's own requirements.
      SRUN_PREAMBLE: |
        nvidia-smi
        pip install \
          pytest \
          pytest-reportlog \
          cuda-python \
          -r ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder/requirements.txt
      # Run the three encoder test modules and write a JSON-lines report
      # into the mounted output directory.
      SRUN_SCRIPT: |
        set -ex
        cd ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder
        pytest --report-log=/output/pytest-report.jsonl \
          test_single_gpu_encoder.py \
          test_multigpu_encoder.py \
          test_model_parallel_encoder.py

  # Aggregates the pytest reports from the test jobs into sitrep.json and a
  # badge JSON file, then uploads both as one artifact.
  sitrep:
    # Depend only on jobs that exist in this workflow. The previous list
    # also named `te-unittests`, which is not defined in this file.
    needs: te-multi-gpu
    # Still produce a report when the test jobs failed.
    if: success() || failure()
    runs-on: ubuntu-latest
    env:
      ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
      BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: |
            ${{ inputs.ARTIFACT_PREFIX }}-*
          # NOTE(review): merging extracts every matched artifact into the
          # same directory; if each matrix job uploads pytest-report.jsonl
          # at the same relative path, only one copy survives. Confirm the
          # layout produced by _test_slurm_pyxis.yaml.
          merge-multiple: true

      - name: Generate sitrep
        shell: bash -x -e {0}
        run: |
          # bring in utility functions
          source .github/workflows/scripts/to_json.sh

          # Intentionally unquoted below: may expand to several report files.
          test_outcome_files=$(find . -name pytest-report.jsonl)

          badge_label='TE Multi GPU tests'
          # Count test results (TestReport), not collection records
          # (CollectReport). A test passes when its "call" phase passes;
          # a failure in any phase (setup/call/teardown) counts as failed.
          passed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
          failed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "TestReport" and .outcome == "failed") | .outcome' | wc -l)
          total_tests=$((failed_tests + passed_tests))

          if [[ ${total_tests} == 0 ]]; then
            # No results at all: the run itself did not complete.
            badge_message='error'
            badge_color=red
            summary='TE multi GPU tests did not complete due to errors.'
          else
            badge_message="${passed_tests}/${total_tests} passed"
            if [[ ${failed_tests} == 0 ]]; then
              badge_color=brightgreen
            else
              badge_color=yellow
            fi
            summary="TE multi GPU tests : $badge_message"
          fi

          # Full status report for this run.
          run_id=${{ github.run_id }} \
          to_json \
            run_id \
            summary \
            total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json

          # Badge description file (schemaVersion/label/message/color).
          schemaVersion=1 \
          label="${badge_label}" \
          message="${badge_message}" \
          color="${badge_color}" \
          to_json schemaVersion label message color \
          > ${{ env.BADGE_FILENAME_FULL }}

      - name: Upload training logs as artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.ARTIFACT_NAME_FULL }}
          path: |
            sitrep.json
            ${{ env.BADGE_FILENAME_FULL }}

0 commit comments

Comments
 (0)