Skip to content

Commit ec474b5

Browse files
authored
Update TransformerEngine test configuration (#1470)
- [x] Remove `L1_jax_distributed_unittest` TransformerEngine unit tests
1 parent d26c2b9 commit ec474b5

File tree

4 files changed

+41
-23
lines changed

4 files changed

+41
-23
lines changed
Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
set -xu -o pipefail
2+
3+
LOG_DIR=/opt/output
4+
15
pip install pytest-reportlog pytest-xdist
26

37
# Start MPS daemon
@@ -6,21 +10,19 @@
610
# TE's default is slightly different, without the hyphen
711
export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
812

9-
# 1 GPU per worker, 6 workers per GPU
10-
pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh | tee -a ${LOG_DIR}/pytest_stdout.log
13+
# 1 GPU per worker, 4 workers per GPU
14+
pytest-xdist.sh 1 4 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh | tee -a ${LOG_DIR}/pytest_stdout.log
1115

1216
# 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
1317
# into a single .jsonl file of results from multiple pytest invocations
1418
# inside the test.sh script, so it's useful even with a single worker per
1519
# device.
16-
pytest-xdist.sh 8 1 pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh | tee -a ${LOG_DIR}/pytest_stdout.log
17-
pytest-xdist.sh 8 1 pytest-report-L1-distributed-unittest.jsonl bash ${TE_PATH}/qa/L1_jax_distributed_unittest/test.sh | tee -a ${LOG_DIR}/pytest_stdout.log
20+
pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh | tee -a ${LOG_DIR}/pytest_stdout.log
1821

1922
# merge the log files
2023
cat \
21-
pytest-report-L0-unittest.jsonl \
22-
pytest-report-L0-distributed-unittest.jsonl \
23-
pytest-report-L1-distributed-unittest.jsonl \
24+
${LOG_DIR}/pytest-report-L0-unittest.jsonl \
25+
${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \
2426
> ${LOG_DIR}/pytest-report.jsonl
2527

2628
touch ${LOG_DIR}/done

.github/workflows/_ci.yaml

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -490,7 +490,7 @@ jobs:
490490
CI_NAME: transformer-engine
491491
secrets: inherit
492492

493-
test-te-unit-a100:
493+
test-te-a100:
494494
needs: build-jax
495495
secrets: inherit
496496
if: >-
@@ -506,30 +506,32 @@ jobs:
506506
docker run -i --gpus all --shm-size=1g -v $PWD:/log \
507507
${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
508508
bash <<"EOF" |& tee test-te.log
509-
set -x
509+
set -xu -o pipefail
510+
511+
LOG_DIR=/log
510512
511513
pip install pytest-reportlog pytest-xdist
512514
# Start MPS daemon
513515
nvidia-cuda-mps-control -d
514516
# TE's default is slightly different, without the hyphen
515517
export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
516-
# 1 GPU per worker, 6 workers per GPU
517-
pytest-xdist.sh 1 3 /log/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
518+
# 1 GPU per worker, 3 workers per GPU
519+
pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
518520
## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
519521
## into a single .jsonl file of results from multiple pytest invocations
520522
## inside the test.sh script, so it's useful even with a single worker per
521523
## device.
522-
pytest-xdist.sh 8 1 /log/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
524+
pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
523525
524526
# merge the log files
525527
cat \
526-
/log/pytest-report-L0-unittest.jsonl
527-
/log/pytest-report-L0-distributed-unittest.jsonl
528-
> /log/pytest-report.jsonl
528+
${LOG_DIR}/pytest-report-L0-unittest.jsonl \
529+
${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \
530+
> ${LOG_DIR}/pytest-report.jsonl
531+
529532
EOF
530533
STATISTICS_SCRIPT: |
531-
ls
532-
report_json=pytest-report-L0-unittest.jsonl
534+
report_json=pytest-report.jsonl
533535
summary_line=$(tail -n1 test-te.log)
534536
errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
535537
passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
@@ -539,13 +541,20 @@ jobs:
539541
echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
540542
echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
541543
echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
544+
545+
echo "$failed_tests tests failed"
546+
if [[ $failed_tests -gt 0 ]]; then
547+
exit 1
548+
else
549+
exit 0
550+
fi
551+
542552
TIMEOUT_MINUTES: 120
543553
ARTIFACTS: |
544554
test-te.log
545555
pytest-report.jsonl
546556
pytest-report-L0-unittest.jsonl
547557
pytest-report-L0-distributed-unittest.jsonl
548-
pytest-report-L1-distributed-unittest.jsonl
549558
550559
test-rosetta-t5x:
551560
needs: build-rosetta-t5x

.github/workflows/_transformer_engine_eks.yaml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ permissions:
2525
packages: write # to upload container
2626

2727
jobs:
28-
te-test-eks:
28+
te-test-h100:
2929
env:
3030
TE_EKS_FILES_PREFIX: .github/eks-workflow-files/transformer-engine
3131
RUN_NAME: ${{ inputs.JOB_NAME }}-${{ matrix.N_GPU }}gpu-${{ matrix.TEST }}
@@ -99,6 +99,13 @@ jobs:
9999
echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
100100
echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
101101
102+
echo "$failed_tests tests failed"
103+
if [[ $failed_tests -gt 0 ]]; then
104+
exit 1
105+
else
106+
exit 0
107+
fi
108+
102109
- name: Generate sitrep
103110
id: sitrep
104111
if: ${{ !cancelled() }}
@@ -137,8 +144,8 @@ jobs:
137144
if: ${{ !cancelled() }}
138145
uses: actions/upload-artifact@v4
139146
with:
140-
name: "artifact-multigpu-test-${{ env.RUN_NAME }}"
147+
name: "te-unit-test-H100"
141148
path: |
142149
sitrep.json
143150
badge-transformer-engine-test.json
144-
trasformer-engine-output/*
151+
transformer-engine-output/*

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,11 @@ We support and test the following JAX frameworks and model architectures. More d
7575
</a>
7676
<br>
7777
<a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae#file-badge-te-unit-test-A100-json">
78-
<img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-te-unit-test-A100.json&logo=nvidia&label=TransformerEngine%20A100">
78+
<img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-te-unit-test-a100.json&logo=nvidia&label=TransformerEngine%20A100">
7979
</a>
8080
<br>
8181
<a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae#file-badge-transformer-engine-test.json">
82-
<img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-transformer-engine-test.json&logo=nvidia&label=TransformerEngine%20H100">
82+
<img style="height:1em;" src="https://img.shields.io/endpoint?url=https%3A%2F%2Fgist.githubusercontent.com%2Fnvjax%2F913c2af68649fe568e9711c2dabb23ae%2Fraw%2Fbadge-transformer-engine-test.json&logo=nvidia&label=TransformerEngine%20H100">
8383
</a>
8484
<br>
8585
<a href="https://gist.github.com/nvjax/913c2af68649fe568e9711c2dabb23ae#file-badge-nsys-jax-unit-test-a100-json">

0 commit comments

Comments
 (0)