Commit 5805ee6

Update benchmarks.yml
1 parent 9b9830f commit 5805ee6

File tree

1 file changed: +114 -61 lines changed

.github/workflows/benchmarks.yml

Lines changed: 114 additions & 61 deletions
@@ -18,82 +18,135 @@ on:
 # - 'yes'
 # - 'no'
 
-jobs:
-  build-xla-gpu-and-test:
-    runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner
-    container:
-      image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
-      options: --gpus all --privileged # Might need privileged mode, use with caution
+# jobs:
+#   build-xla-gpu-and-test:
+#     runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner
+#     container:
+#       image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
+#       options: --gpus all --privileged # Might need privileged mode, use with caution
 
+#     steps:
+#     - name: Checkout XLA
+#       uses: actions/checkout@v3
+#       with:
+#         repository: juliagmt-google/xla
+
+#     - name: Print machine specs
+#       run: |
+#         lscpu
+#         free -h # Memory information
+#         df -h # Disk space information
+#         uname -a # Kernel information
+
+#     - name: Create results directory
+#       working-directory: xla
+#       run: mkdir results
+
+#     # - name: Wait For Connection
+#     #   uses: google-ml-infra/actions/ci_connection@main
+#     #   with:
+#     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#     - name: Configure XLA
+#       working-directory: xla
+#       run: |
+#         cd ..
+#         ls
+#         ./configure.py --backend CUDA --nccl
+
+#     - name: Set TF_CPP_MAX_VLOG_LEVEL
+#       working-directory: xla
+#       run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
+
+#     - name: Build hlo_runner_main
+#       working-directory: xla
+#       run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
+
+#     # - name: Wait For Connection
+#     #   uses: google-ml-infra/actions/ci_connection@main
+#     #   with:
+#     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#     # - name: Wait For Connection
+#     #   uses: google-ml-infra/actions/ci_connection@main
+#     #   with:
+#     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#     - name: Run an HLO file
+#       working-directory: xla
+#       run: |
+#         cd ..
+#         ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log
+
+#     # - name: Wait For Connection
+#     #   uses: google-ml-infra/actions/ci_connection@main
+#     #   with:
+#     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#     # # - name: Download parse_xla_logs.py
+#     # #   working-directory: xla
+#     # #   run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
+
+#     # # - name: Parse XLA logs
+#     # #   working-directory: xla
+#     # #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
+
+#     - name: Upload Results
+#       uses: actions/upload-artifact@v4
+#       with:
+#         name: gpu-xla-benchmarks
+#         path: xla/results
+
+
+jobs:
+  Tests:
+    strategy:
+      # Don't fail fast - want to see results for all builds even if one fails.
+      fail-fast: false
+      matrix:
+        job_info: [
+          {
+            os: "linux-x86-g2-48-l4-4gpu",
+            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest",
+            pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs",
+          },
+          # Expect more GPU types in the future.
+        ]
+    name: ${{ matrix.job_info.pretty_name }}
+    runs-on: ${{ matrix.job_info.os }}
+    container: ${{ matrix.job_info.container }}
+    defaults:
+      run:
+        shell: bash
+    timeout-minutes: 360
     steps:
     - name: Checkout XLA
-      uses: actions/checkout@v3
-      with:
-        repository: juliagmt-google/xla
-
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
     - name: Print machine specs
       run: |
-        lscpu
+        nvidia-smi
         free -h # Memory information
         df -h # Disk space information
         uname -a # Kernel information
-
     - name: Create results directory
-      working-directory: xla
       run: mkdir results
-
-    # - name: Wait For Connection
-    #   uses: google-ml-infra/actions/ci_connection@main
-    #   with:
-    #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-    - name: Configure XLA
-      working-directory: xla
+
+    - name: Configure XLA for GPU backend
       run: |
-        cd ..
-        ls
         ./configure.py --backend CUDA --nccl
-
     - name: Set TF_CPP_MAX_VLOG_LEVEL
-      working-directory: xla
-      run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
+      env:
+        TF_CPP_MAX_VLOG_LEVEL: 1
 
     - name: Build hlo_runner_main
-      working-directory: xla
       run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
-
-    # - name: Wait For Connection
-    #   uses: google-ml-infra/actions/ci_connection@main
-    #   with:
-    #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-    # - name: Wait For Connection
-    #   uses: google-ml-infra/actions/ci_connection@main
-    #   with:
-    #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-    - name: Run an HLO file
-      working-directory: xla
-      run: |
-        cd ..
-        ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log
-
-    # - name: Wait For Connection
-    #   uses: google-ml-infra/actions/ci_connection@main
-    #   with:
-    #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-    # # - name: Download parse_xla_logs.py
-    # #   working-directory: xla
-    # #   run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
+    # TODO(juliagmt): Add more performance-criticalHLOs to benchmark.
+    - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
+      ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
 
-    # # - name: Parse XLA logs
-    # #   working-directory: xla
-    # #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
-
-    - name: Upload Results
-      uses: actions/upload-artifact@v4
+    - name: Upload XSpace
+      uses: actions/upload-artifact@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
-        name: gpu-xla-benchmarks
-        path: xla/results
-
+        name: gpu-xla-benchmarks-xspace
+        path: xla/results/xspace.pbtxt
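
For readability, the new Tests job is consolidated below from the added (+) lines of the diff. This is a sketch of how .github/workflows/benchmarks.yml would read after this commit, not an authoritative copy: indentation and blank lines are reconstructed (the rendered diff strips whitespace), and the run: key on the gpu_hlo_pass.hlo step is an assumption, since the diff shows that command directly under its name: key. All other keys, values, and comments are taken from the added lines.

jobs:
  Tests:
    strategy:
      # Don't fail fast - want to see results for all builds even if one fails.
      fail-fast: false
      matrix:
        job_info: [
          {
            os: "linux-x86-g2-48-l4-4gpu",
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest",
            pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs",
          },
          # Expect more GPU types in the future.
        ]
    name: ${{ matrix.job_info.pretty_name }}
    runs-on: ${{ matrix.job_info.os }}
    container: ${{ matrix.job_info.container }}
    defaults:
      run:
        shell: bash
    timeout-minutes: 360
    steps:
    - name: Checkout XLA
      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

    - name: Print machine specs
      run: |
        nvidia-smi
        free -h  # Memory information
        df -h    # Disk space information
        uname -a # Kernel information

    - name: Create results directory
      run: mkdir results

    - name: Configure XLA for GPU backend
      run: |
        ./configure.py --backend CUDA --nccl

    - name: Set TF_CPP_MAX_VLOG_LEVEL
      # As added by this commit, this step carries only an env: key (no run: or uses:).
      env:
        TF_CPP_MAX_VLOG_LEVEL: 1

    - name: Build hlo_runner_main
      run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main

    # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
    - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
      # The run: key here is assumed; the diff shows the command without it.
      run: |
        ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo

    - name: Upload XSpace
      uses: actions/upload-artifact@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      with:
        name: gpu-xla-benchmarks-xspace
        path: xla/results/xspace.pbtxt

The gpu-xla-benchmarks-xspace artifact is simply the xspace.pbtxt file produced by the --xla_gpu_dump_xspace_to flag on the hlo_runner_main invocation above.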
