       - 'no'

 jobs:
-  # jax-build-and-test:
-  #   runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
-  #   container:
-  #     image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
+  jax-build-and-test:
+    runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
+    container:
+      image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"

-  #   env:
-  #     JAXCI_HERMETIC_PYTHON_VERSION: 3.11
+    env:
+      JAXCI_HERMETIC_PYTHON_VERSION: 3.11

-  #   steps:
-  #     - name: Checkout JAX Fork
-  #       uses: actions/checkout@v3
-  #       with:
-  #         repository: 'google-ml-infra/jax-fork'
-  #         path: jax-fork
+    steps:
+      - name: Checkout JAX Fork
+        uses: actions/checkout@v3
+        with:
+          repository: 'google-ml-infra/jax-fork'
+          path: jax-fork

-  #     - name: Install JAX Dependencies
-  #       working-directory: jax-fork
-  #       run: |
-  #         python -m pip install --upgrade pip
-  #         pip install pytest
-  #         pip install absl-py
-  #         pip install "jax[cuda12_pip]" # Adjust CUDA version if needed
-  #         pip install google-benchmark
-  #     - name: Run JAX Multiprocess GPU Test
-  #       working-directory: jax-fork
-  #       continue-on-error: true
-  #       run: python -m pytest tests/multiprocess_gpu_test.py
+      - name: Install JAX Dependencies
+        working-directory: jax-fork
+        run: |
+          python -m pip install --upgrade pip
+          pip install pytest
+          pip install absl-py
+          pip install "jax[cuda12_pip]" # Adjust CUDA version if needed
+          pip install google-benchmark
+      - name: Run JAX Multiprocess GPU Test
+        working-directory: jax-fork
+        continue-on-error: true
+        run: python -m pytest tests/multiprocess_gpu_test.py

   build-xla-gpu-and-test:
     runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
         with:
           repository: openxla/xla # Replace with your fork if needed
           path: xla
-
-      - name: Build XLA with GPU support with RBE
-        working-directory: xla
-        continue-on-error: true
-        run: bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
-
-      - name: Run XLA tests with GPU support with RBE
-        working-directory: xla
-        continue-on-error: true
-        run: bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
-
-      - name: Run Profile Analysis
-        working-directory: xla
-        continue-on-error: true
-        run: bazel analyze-profile profile.json.gz

       - name: Create results directory
         working-directory: xla
@@ -84,19 +69,23 @@ jobs:
       - name: Configure XLA
         working-directory: xla
         run: ./configure.py --backend CUDA --nccl
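+      # configure.py writes the bazel configuration used by the builds below;
+      # --backend CUDA targets NVIDIA GPUs and --nccl enables NCCL for multi-GPU runs.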
-
+
       - name: Set TF_CPP_MAX_VLOG_LEVEL
         working-directory: xla
-        run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV
-
-      - name: Build run_hlo_module with GPU
+        run: export TF_CPP_MAX_VLOG_LEVEL=1
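+        # Note: `export` only lasts for this step's shell. For the value to reach
+        # later steps (like the check below) it has to be appended to "$GITHUB_ENV",
+        # which is what the removed line above did.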
+
+      - name: Check TF_CPP_MAX_VLOG_LEVEL
         working-directory: xla
-        run: bazel build -c opt --dynamic_mode=off xla/tools:run_hlo_module --config=cuda
+        run: echo "$TF_CPP_MAX_VLOG_LEVEL"

-      - name: Wait For Connection
-        uses: google-ml-infra/actions/ci_connection@main
-        with:
-          halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      - name: Build hlo_runner_main
+        working-directory: xla
+        run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
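+        # hlo_runner_main replays HLO modules on one or more devices; it is built
+        # once here so the benchmark loop below can call the bazel-bin binary directly.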
+
+      # - name: Wait For Connection
+      #   uses: google-ml-infra/actions/ci_connection@main
+      #   with:
+      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}

       - name: Run HLO Module Benchmarks with GPU in xla/tests/fuzz
         working-directory: xla
@@ -109,33 +98,10 @@ jobs:
             echo "Skipping benchmark on $file"
             continue
           fi
-          echo "Running benchmark on $file" &> results/"$file".log
-          # ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$file" &> results/"$filename".log
-        done
-      # - name: Wait For Connection
-      #   uses: google-ml-infra/actions/ci_connection@main
-      #   with:
-      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-      - name: Run HLO Module Benchmarks with GPU in xla/service/gpu
-        working-directory: xla
-        continue-on-error: true
-        run: |
-          find xla/service/gpu -name "*.hlo" -print0 | while IFS= read -r -d $'\0' file; do
-            ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$file" &> results/"$(basename "$file")".log
+            echo "Running benchmark on $file"
+            ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning "$file" &> results/"$filename".log
           done
-      # - name: Build hlo_runner_main
-      #   working-directory: xla
-      #   run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
-
-      # - name: Run XLA GPU microbenchmarks with hlo_runner_main
-      #   working-directory: xla
-      #   continue-on-error: true
-      #   run: |
-      #     for file in xla/tools/multihost_hlo_runner/data/*.hlo; do
-      #       filename=$(basename "$file")
-      #       echo "Running benchmark on $file"
-      #       bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main "$file"
-      #     done
+
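+      # To reproduce one benchmark locally (illustrative; <module>.hlo stands for
+      # any HLO file under xla/tests/fuzz, and the build step above must have run):
+      #   bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main \
+      #     --device_type=gpu --use_spmd_partitioning xla/tests/fuzz/<module>.hlo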
       - name: Upload Results
         uses: actions/upload-artifact@v4
         with: