Commit 2222254

Update benchmarks.yml
1 parent a69190b


.github/workflows/benchmarks.yml

Lines changed: 40 additions & 74 deletions
@@ -16,33 +16,33 @@ on:
         - 'no'

 jobs:
-  # jax-build-and-test:
-  #   runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
-  #   container:
-  #     image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
+  jax-build-and-test:
+    runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
+    container:
+      image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"

-  #   env:
-  #     JAXCI_HERMETIC_PYTHON_VERSION: 3.11
+    env:
+      JAXCI_HERMETIC_PYTHON_VERSION: 3.11

-  #   steps:
-  #     - name: Checkout JAX Fork
-  #       uses: actions/checkout@v3
-  #       with:
-  #         repository: 'google-ml-infra/jax-fork'
-  #         path: jax-fork
+    steps:
+      - name: Checkout JAX Fork
+        uses: actions/checkout@v3
+        with:
+          repository: 'google-ml-infra/jax-fork'
+          path: jax-fork

-  #     - name: Install JAX Dependencies
-  #       working-directory: jax-fork
-  #       run: |
-  #         python -m pip install --upgrade pip
-  #         pip install pytest
-  #         pip install absl-py
-  #         pip install "jax[cuda12_pip]" # Adjust CUDA version if needed
-  #         pip install google-benchmark
-  #     - name: Run JAX Multiprocess GPU Test
-  #       working-directory: jax-fork
-  #       continue-on-error: true
-  #       run: python -m pytest tests/multiprocess_gpu_test.py
+      - name: Install JAX Dependencies
+        working-directory: jax-fork
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest
+          pip install absl-py
+          pip install "jax[cuda12_pip]" # Adjust CUDA version if needed
+          pip install google-benchmark
+      - name: Run JAX Multiprocess GPU Test
+        working-directory: jax-fork
+        continue-on-error: true
+        run: python -m pytest tests/multiprocess_gpu_test.py

   build-xla-gpu-and-test:
     runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
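This first hunk un-comments the jax-build-and-test job wholesale, so the JAX multiprocess GPU smoke test runs again. Outside CI, the re-enabled job reduces to a short shell session; the following is a sketch assuming a CUDA 12 machine with a checkout of google-ml-infra/jax-fork, not part of the commit:

    # Hypothetical local reproduction of the jax-build-and-test job
    python -m pip install --upgrade pip
    pip install pytest absl-py google-benchmark
    pip install "jax[cuda12_pip]"    # adjust the CUDA version if needed
    python -m pytest tests/multiprocess_gpu_test.py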
@@ -56,21 +56,6 @@ jobs:
       with:
         repository: openxla/xla # Replace with your fork if needed
         path: xla
-
-      - name: Build XLA with GPU support with RBE
-        working-directory: xla
-        continue-on-error: true
-        run: bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
-
-      - name: Run XLA tests with GPU support with RBE
-        working-directory: xla
-        continue-on-error: true
-        run: bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
-
-      - name: Run Profile Analysis
-        working-directory: xla
-        continue-on-error: true
-        run: bazel analyze-profile profile.json.gz

       - name: Create results directory
         working-directory: xla
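This hunk drops the RBE build, RBE test, and profile-analysis steps from build-xla-gpu-and-test. The removed bazel invocations recorded a build trace to profile.json.gz via --profile; if such a trace is produced some other way, the deleted analysis can still be run by hand. A sketch, not part of the workflow:

    # Summarize a previously recorded Bazel build profile
    bazel analyze-profile profile.json.gz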
@@ -84,19 +69,23 @@ jobs:
       - name: Configure XLA
         working-directory: xla
         run: ./configure.py --backend CUDA --nccl
-
+
       - name: Set TF_CPP_MAX_VLOG_LEVEL
         working-directory: xla
-        run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV
-
-      - name: Build run_hlo_module with GPU
+        run: export TF_CPP_MAX_VLOG_LEVEL=1
+
+      - name: Check TF_CPP_MAX_VLOG_LEVEL
         working-directory: xla
-        run: bazel build -c opt --dynamic_mode=off xla/tools:run_hlo_module --config=cuda
+        run: echo "$TF_CPP_MAX_VLOG_LEVEL"

-      - name: Wait For Connection
-        uses: google-ml-infra/actions/ci_connection@main
-        with:
-          halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      - name: Build hlo_runner_main
+        working-directory: xla
+        run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
+
+      # - name: Wait For Connection
+      #   uses: google-ml-infra/actions/ci_connection@main
+      #   with:
+      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}

       - name: Run HLO Module Benchmarks with GPU in xla/tests/fuzz
         working-directory: xla
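One behavioral caveat in this hunk: each run: block executes in its own shell, so the new export only lives for the duration of the Set TF_CPP_MAX_VLOG_LEVEL step, and the following Check step will most likely print an empty string. The removed $GITHUB_ENV line was the mechanism that actually persisted the variable into later steps. A minimal sketch of the two approaches, assuming a bash shell on the runner:

    # Visible only inside the current step's shell:
    export TF_CPP_MAX_VLOG_LEVEL=1
    # Persisted into every subsequent step of the job (the approach this commit removes):
    echo "TF_CPP_MAX_VLOG_LEVEL=1" >> "$GITHUB_ENV"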
@@ -109,33 +98,10 @@ jobs:
             echo "Skipping benchmark on $file"
             continue
           fi
-          echo "Running benchmark on $file" &> results/"$file".log
-          # ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$file" &> results/"$filename".log
-        done
-      # - name: Wait For Connection
-      #   uses: google-ml-infra/actions/ci_connection@main
-      #   with:
-      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-      - name: Run HLO Module Benchmarks with GPU in xla/service/gpu
-        working-directory: xla
-        continue-on-error: true
-        run: |
-          find xla/service/gpu -name "*.hlo" -print0 | while IFS= read -r -d $'\0' file; do
-            ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$file" &> results/"$(basename "$file")".log
+          echo "Running benchmark on $file" &> results/"$filename".log
+          ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning "$file" &> results/"$filename".log
         done
-      # - name: Build hlo_runner_main
-      #   working-directory: xla
-      #   run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
-
-      # - name: Run XLA GPU microbenchmarks with hlo_runner_main
-      #   working-directory: xla
-      #   continue-on-error: true
-      #   run: |
-      #     for file in xla/tools/multihost_hlo_runner/data/*.hlo; do
-      #       filename=$(basename "$file")
-      #       echo "Running benchmark on $file"
-      #       bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main "$file"
-      #     done
+
       - name: Upload Results
         uses: actions/upload-artifact@v4
         with:
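One detail worth flagging in the new benchmark loop: both redirections use &>, which truncates the target file, so the hlo_runner_main output immediately overwrites the "Running benchmark on $file" header written one line earlier (and $filename must be assigned somewhere in the unchanged loop context above). A hedged rewrite that keeps both in the same log, assuming the rest of the loop is unchanged:

    # Truncate once for the header, then append the runner output so the header survives:
    echo "Running benchmark on $file" > results/"$filename".log
    ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main \
      --device_type=gpu --use_spmd_partitioning "$file" >> results/"$filename".log 2>&1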
