1515 - ' yes'
1616 - ' no'
1717
18- # jobs:
19- # Tests:
20- # strategy:
21- # # Don't fail fast - want to see results for all builds even if one fails.
22- # fail-fast: false
23- # matrix:
24- # job_info: [
25- # {
26- # os: "linux-x86-n2-16",
27- # container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest",
28- # pretty_name: "XLA Linux x86 CPU",
29- # },
30- # {
31- # os: "linux-arm64-c4a-16",
32- # container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-arm64:latest",
33- # pretty_name: "XLA Linux ARM64 CPU",
34- # }
35- # ]
36- # name: ${{ matrix.job_info.pretty_name }}
37- # runs-on: ${{ matrix.job_info.os }}
38- # container: ${{ matrix.job_info.container }}
39- # defaults:
40- # run:
41- # shell: bash
42- # timeout-minutes: 30
43- # steps:
44- # - name: Print machine specs
45- # run: |
46- # lscpu
47- # free -h # Memory information
48- # df -h # Disk space information
49- # uname -a # Kernel information
50- # - name: Wait For Connection
51- # uses: google-ml-infra/actions/ci_connection@main
52- # with:
53- # halt-dispatch-input: ${{ inputs.halt-for-connection }}
54-
55- # - name: Check Python Version in Container
56- # run: python3 --version
57-
58- # - name: Install Python 3.10 if not present (IN CONTAINER)
59- # run: |
60- # if ! python3 --version > /dev/null 2>&1; then # check for python3
61- # echo "Python 3 not found, installing..."
62- # apt-get update
63- # apt-get install -y python3.10 python3-pip
64- # else
65- # echo "Python 3 found."
66- # fi
67-
68- # - name: Checkout OpenXLA
69- # uses: actions/checkout@v3
70- # with:
71- # repository: openxla/xla
72-
73- # - name: Create results directory
74- # working-directory: xla
75- # run:
76- # mkdir results
77-
78- # - name: Run setup.sh for E2E benchmarks flax_2b
79- # working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
80- # timeout-minutes: 60
81- # run: |
82- # bash setup.sh
83-
84- # - name: Run run.sh for E2E benchmarks flax_2b
85- # working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
86- # timeout-minutes: 30
87- # run: |
88- # bash run.sh | tee -a ../../../../../../results/flax_2b.log
89-
90- # TODO(juliagmt): Re-enable once the build is fixed.
91- # - name: Build run_hlo_module
92- # working-directory: xla
93- # run: bazel build -c opt --dynamic_mode=off //xla/tools:run_hlo_module
94-
95- # - name: Run HLO Module Benchmarks
96- # working-directory: xla
97- # continue-on-error: true
98- # run: |
99- # for file in tests/fuzz/*.hlo; do
100- # filename=$(basename "$file")
101- # # Skip expected failed hlo files.
102- # if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
103- # echo "Skipping benchmark on $file"
104- # continue
105- # fi
106- # echo "Running benchmark on $file"
107- # ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=CPU "$file"
108- # done
109-
110- # - name: Build CPU Benchmarks
111- # working-directory: xla
112- # run: bazel build -c opt --dynamic_mode=off //xla/backends/cpu/benchmarks:*
113-
114- # - name: Run CPU benchmarks
115- # working-directory: xla
116- # continue-on-error: true
117- # run: |
118- # find ./bazel-bin/xla/backends/cpu/benchmarks/ -maxdepth 1 -type f -executable -name "*_test" -print0 | while IFS= read -r -d $'\0' benchmark; do
119- # benchmark_name=$(basename "$benchmark" | sed 's/_test$//')
120- # echo "Running benchmark: $benchmark_name"
121-
122- # # Run the benchmark with default parameters.
123- # $benchmark --benchmark_filter=".*"
124- # $benchmark --benchmark_filter=".*" > "results/$benchmark_name.log" 2>&1
125-
126- # # Check the exit code of the benchmark
127- # if [ $? -ne 0 ]; then
128- # echo "Error: Benchmark '$benchmark_name' failed. Check the log file: results/$benchmark_name.log"
129- # else
130- # echo "Benchmark '$benchmark_name' completed successfully."
131- # fi
132- # done
133- # - name: Upload Results ${{ matrix.build_target }}
134- # uses: actions/upload-artifact@v4
135- # with:
136- # name: cpu-xla-benchmarks-${{ matrix.job_info.os }}
137- # path: xla/results
13818jobs :
13919 build-xla-gpu-and-test :
14020 runs-on : " linux-x86-g2-48-l4-4gpu" # linux-x86-n2-16 # Use a GPU-enabled runner
@@ -146,13 +26,8 @@ jobs:
14626 - name : Checkout XLA
14727 uses : actions/checkout@v3
14828 with :
149- repository : openxla/xla # Replace with your fork if needed
150- - name : Checkout repository
151- uses : actions/checkout@v3
152- with :
153- repository : juliagmt-google/xla
154- ref : dev
155-
29+ repository : openxla/xla
30+
15631 - name : Print machine specs
15732 run : |
15833 lscpu
@@ -164,147 +39,58 @@ jobs:
16439 working-directory : xla
16540 run : mkdir results
16641
167- - name : Wait For Connection
168- uses : google-ml-infra/actions/ci_connection@main
169- with :
170- halt-dispatch-input : ${{ inputs.halt-for-connection }}
171- # - name: Set up Python 3.10 # Choose your desired Python version
172- # uses: actions/setup-python@v4
42+ # - name: Wait For Connection
43+ # uses: google-ml-infra/actions/ci_connection@main
17344 # with:
174- # python-version: '3.10'
175-
176- # # - name: Create and activate virtual environment
177- # # shell: bash # Force the use of bash
178- # # run: |
179- # # python -m venv xla/venv
180- # # source xla/venv/bin/activate
181- # # - name: Set up Python 3.10
182- # # uses: actions/setup-python@v4
183- # # with:
184- # # python-version: '3.10'
185-
186- # # - name: Run setup.sh for E2E benchmarks flax_2b (within venv)
187- # # working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
188- # # shell: bash
189- # # run: |
190- # # bash setup.sh
191-
192- # # - name: Wait For Connection
193- # # uses: google-ml-infra/actions/ci_connection@main
194- # # with:
195- # # halt-dispatch-input: ${{ inputs.halt-for-connection }}
196-
197- # # - name: Run run.sh for E2E benchmarks flax_2b (within venv)
198- # # working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
199- # # timeout-minutes: 30
200- # # shell: bash
201- # # run: |
202- # # bash run.sh > ../../../../../../results/flax_2b.log
203-
204- # # - name: Build run_hlo_module
205- # # working-directory: xla
206- # # run: bazel build -c opt --dynamic_mode=off //xla/tools:run_hlo_module
207-
208- # # - name: Run HLO Module Benchmarks
209- # # working-directory: xla
210- # # continue-on-error: true
211- # # run: |
212- # # for file in xla/tests/fuzz/*.hlo; do
213- # # filename=$(basename "$file")
214- # # # Skip expected failed hlo files.
215- # # if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
216- # # echo "Skipping benchmark on $file"
217- # # continue
218- # # fi
219- # # echo "Running benchmark on $file"
220- # # ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=CPU "$file"
221- # # done
222-
223- # # - name: Build CPU Benchmarks
224- # # working-directory: xla
225- # # run: bazel build -c opt --dynamic_mode=off //xla/backends/cpu/benchmarks:*
226-
227- # # - name: Run CPU benchmarks
228- # # working-directory: xla
229- # # continue-on-error: true
230- # # run: |
231- # # find ./bazel-bin/xla/backends/cpu/benchmarks/ -maxdepth 1 -type f -executable -name "*_test" -print0 | while IFS= read -r -d $'\0' benchmark; do
232- # # benchmark_name=$(basename "$benchmark" | sed 's/_test$//')
233- # # echo "Running benchmark: $benchmark_name"
234-
235- # # # Run the benchmark with default parameters.
236- # # $benchmark --benchmark_filter=".*"
237- # # $benchmark --benchmark_filter=".*" > "results/$benchmark_name.log" 2>&1
238-
239- # # # Check the exit code of the benchmark
240- # # if [ $? -ne 0 ]; then
241- # # echo "Error: Benchmark '$benchmark_name' failed. Check the log file: results/$benchmark_name.log"
242- # # else
243- # # echo "Benchmark '$benchmark_name' completed successfully."
244- # # fi
245- # # done
246-
247-
248- # # - name: Wait For Connection
249- # # uses: google-ml-infra/actions/ci_connection@main
250- # # with:
251- # # halt-dispatch-input: ${{ inputs.halt-for-connection }}
252- # # - name: Get GPU spec
253- # # working-directory: xla
254- # # continue-on-error: true
255- # # run: nvidia-smi
45+ # halt-dispatch-input: ${{ inputs.halt-for-connection }}
25646
257- # # - name: Configure XLA
258- # # working-directory: xla
259- # # run: ./configure.py --backend CUDA --nccl
260-
261- # # - name: Set TF_CPP_MAX_VLOG_LEVEL
262- # # working-directory: xla
263- # # run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
47+ - name : Configure XLA
48+ working-directory : xla
49+ run : ./configure.py --backend CUDA --nccl
26450
265- # # - name: Check TF_CPP_MAX_VLOG_LEVEL
266- # # working-directory: xla
267- # # run: echo "$ TF_CPP_MAX_VLOG_LEVEL"
51+ - name : Set TF_CPP_MAX_VLOG_LEVEL
52+ working-directory : xla
53+ run : echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
26854
269- # # - name: Build hlo_runner_main
270- # # working-directory: xla
271- # # run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
55+ - name : Build hlo_runner_main
56+ working-directory : xla
57+ run : bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
27258
273- # # - name: Wait For Connection
274- # # uses: google-ml-infra/actions/ci_connection@main
275- # # with:
276- # # halt-dispatch-input: ${{ inputs.halt-for-connection }}
59+ # - name: Wait For Connection
60+ # uses: google-ml-infra/actions/ci_connection@main
61+ # with:
62+ # halt-dispatch-input: ${{ inputs.halt-for-connection }}
27763
278- # # - name: Create gpu_hlo_backend.hlo
279- # # working-directory: xla
280- # # run: |
281- # cat << EOF > gpu_hlo_backend.hlo
282- # HloModule module
283- # // CHECK: is_scheduled=true
64+ - name : Create gpu_hlo_backend.hlo
65+ working-directory : xla
66+ run : |
67+ cat << EOF > gpu_hlo_backend.hlo
68+ HloModule module
69+ // CHECK: is_scheduled=true
28470
285- # ENTRY computation {
286- # p = f32[5000,6000]{1,0} parameter(0)
287- # e = f32[5000,6000]{1,0} sqrt(p)
288- # c = f32[6000,5000] transpose(p), dimensions={1,0}
289- # r = f32[300,20,5000] reshape(c)
290- # ROOT out = (f32[5000,6000], f32[300,20,5000]) tuple(e,r)
291- # }
292- # EOF
71+ ENTRY computation {
72+ p = f32[5000,6000]{1,0} parameter(0)
73+ e = f32[5000,6000]{1,0} sqrt(p)
74+ c = f32[6000,5000] transpose(p), dimensions={1,0}
75+ r = f32[300,20,5000] reshape(c)
76+ ROOT out = (f32[5000,6000], f32[300,20,5000]) tuple(e,r)
77+ }
78+ EOF
29379
294- # # - name: Wait For Connection
295- # # uses: google-ml-infra/actions/ci_connection@main
296- # # with:
297- # # halt-dispatch-input: ${{ inputs.halt-for-connection }}
80+ # - name: Wait For Connection
81+ # uses: google-ml-infra/actions/ci_connection@main
82+ # with:
83+ # halt-dispatch-input: ${{ inputs.halt-for-connection }}
29884
299- # # - name: Run an HLO file
300- # # working-directory: xla
301- # # run: |
302- # # ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning gpu_hlo_backend.hlo &> results/gpu_hlo_backend.log
85+ - name : Run an HLO file
86+ working-directory : xla # results/ is created in this directory earlier in the job and uploaded as xla/results
87+ run : |
88+ ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=results/xspace.pb gpu_hlo_backend.hlo &> results/gpu_hlo_backend.log
30389
304- # # - name: Wait For Connection
305- # # uses: google-ml-infra/actions/ci_connection@main
306- # # with:
307- # # halt-dispatch-input: ${{ inputs.halt-for-connection }}
90+ - name : Wait For Connection
91+ uses : google-ml-infra/actions/ci_connection@main
92+ with :
93+ halt-dispatch-input : ${{ inputs.halt-for-connection }}
30894
30995 # # - name: Download parse_xla_logs.py
31096 # # working-directory: xla
@@ -314,57 +100,9 @@ jobs:
314100 # # working-directory: xla
315101 # # run: python parse_xla_logs.py results/gpu_hlo_backend.log
316102
317- # - name: Upload Results
318- # uses: actions/upload-artifact@v4
319- # with:
320- # name: gpu-xla-benchmarks
321- # path: xla/results
322- # # # jax-build-and-test:
323- # # # runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
324- # # # container:
325- # # # image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
326-
327- # # # env:
328- # # # JAXCI_HERMETIC_PYTHON_VERSION: 3.11
329-
330- # # # steps:
331- # # # - name: Checkout JAX Fork
332- # # # uses: actions/checkout@v3
333- # # # with:
334- # # # repository: 'google-ml-infra/jax-fork'
335- # # # path: jax-fork
336-
337- # # # - name: Install JAX Dependencies
338- # # # working-directory: jax-fork
339- # # # run: |
340- # # # python -m pip install --upgrade pip
341- # # # pip install pytest
342- # # # pip install absl-py
343- # # # pip install "jax[cuda12_pip]" # Adjust CUDA version if needed
344- # # # pip install google-benchmark
345- # # # - name: Run JAX Multiprocess GPU Test
346- # # # working-directory: jax-fork
347- # # # continue-on-error: true
348- # # # run: python -m pytest tests/multiprocess_gpu_test.py
349-
350-
351- # # # - name: Run HLO Module Benchmarks withg GPU in xla/tests/fuzz
352- # # # working-directory: xla
353- # # # continue-on-error: true
354- # # # run: |
355- # # # for file in xla/tests/fuzz/*.hlo; do
356- # # # filename=$(basename "$file")
357- # # # # Skip expected failed hlo files.
358- # # # if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
359- # # # echo "Skipping benchmark on $file"
360- # # # continue
361- # # # fi
362- # # # echo "Running benchmark on $file" &> results/"$filename".log
363- # # # ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning "$file" &> results/"$filename".log
364- # # # done
365-
366- # # # - name: Upload Results
367- # # # uses: actions/upload-artifact@v4
368- # # # with:
369- # # # name: gpu-xla-benchmarks
370- # # # path: xla/results
103+ - name : Upload Results
104+ uses : actions/upload-artifact@v4
105+ with :
106+ name : gpu-xla-benchmarks
107+ path : xla/results
108+
0 commit comments