Skip to content

Commit 5ba454c

Browse files
Update benchmarks.yml
1 parent a1c684d commit 5ba454c

File tree

1 file changed

+126
-126
lines changed

1 file changed

+126
-126
lines changed

.github/workflows/benchmarks.yml

Lines changed: 126 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -7,146 +7,146 @@ on:
77
workflow_dispatch: # Allows manual triggering
88
schedule:
99
- cron: '0 */6 * * *' # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18)
10-
# workflow_dispatch:
11-
# inputs:
12-
# halt-for-connection:
13-
# description: 'Should this workflow run wait for a remote connection?'
14-
# type: choice
15-
# required: true
16-
# default: 'no'
17-
# options:
18-
# - 'yes'
19-
# - 'no'
10+
workflow_dispatch:
11+
inputs:
12+
halt-for-connection:
13+
description: 'Should this workflow run wait for a remote connection?'
14+
type: choice
15+
required: true
16+
default: 'no'
17+
options:
18+
- 'yes'
19+
- 'no'
2020

21-
# jobs:
22-
# build-xla-gpu-and-test:
23-
# runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner
24-
# container:
25-
# image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
26-
# options: --gpus all --privileged # Might need privileged mode, use with caution
21+
jobs:
22+
build-xla-gpu-and-test:
23+
runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner
24+
container:
25+
image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
26+
options: --gpus all --privileged # Might need privileged mode, use with caution
2727

28-
# steps:
29-
# - name: Checkout XLA
30-
# uses: actions/checkout@v3
31-
# with:
32-
# repository: juliagmt-google/xla
28+
steps:
29+
- name: Checkout XLA
30+
uses: actions/checkout@v3
31+
with:
32+
repository: openxla/xla
3333

34-
# - name: Print machine specs
35-
# run: |
36-
# lscpu
37-
# free -h # Memory information
38-
# df -h # Disk space information
39-
# uname -a # Kernel information
34+
- name: Print machine specs
35+
run: |
36+
lscpu
37+
free -h # Memory information
38+
df -h # Disk space information
39+
uname -a # Kernel information
4040
41-
# - name: Create results directory
42-
# working-directory: xla
43-
# run: mkdir results
41+
- name: Create results directory
42+
working-directory: xla
43+
run: mkdir results
4444

45-
# # - name: Wait For Connection
46-
# # uses: google-ml-infra/actions/ci_connection@main
47-
# # with:
48-
# # halt-dispatch-input: ${{ inputs.halt-for-connection }}
45+
- name: Wait For Connection
46+
uses: google-ml-infra/actions/ci_connection@main
47+
with:
48+
halt-dispatch-input: ${{ inputs.halt-for-connection }}
4949

50-
# - name: Configure XLA
51-
# working-directory: xla
52-
# run: |
53-
# cd ..
54-
# ls
55-
# ./configure.py --backend CUDA --nccl
50+
# - name: Configure XLA
51+
# working-directory: xla
52+
# run: |
53+
# cd ..
54+
# ls
55+
# ./configure.py --backend CUDA --nccl
5656

57-
# - name: Set TF_CPP_MAX_VLOG_LEVEL
58-
# working-directory: xla
59-
# run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
57+
# - name: Set TF_CPP_MAX_VLOG_LEVEL
58+
# working-directory: xla
59+
# run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
6060

61-
# - name: Build hlo_runner_main
62-
# working-directory: xla
63-
# run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
61+
# - name: Build hlo_runner_main
62+
# working-directory: xla
63+
# run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
6464

65-
# # - name: Wait For Connection
66-
# # uses: google-ml-infra/actions/ci_connection@main
67-
# # with:
68-
# # halt-dispatch-input: ${{ inputs.halt-for-connection }}
65+
# # - name: Wait For Connection
66+
# # uses: google-ml-infra/actions/ci_connection@main
67+
# # with:
68+
# # halt-dispatch-input: ${{ inputs.halt-for-connection }}
6969

70-
# # - name: Wait For Connection
71-
# # uses: google-ml-infra/actions/ci_connection@main
72-
# # with:
73-
# # halt-dispatch-input: ${{ inputs.halt-for-connection }}
70+
# # - name: Wait For Connection
71+
# # uses: google-ml-infra/actions/ci_connection@main
72+
# # with:
73+
# # halt-dispatch-input: ${{ inputs.halt-for-connection }}
7474

75-
# - name: Run an HLO file
76-
# working-directory: xla
77-
# run: |
78-
# cd ..
79-
# ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log
75+
# - name: Run an HLO file
76+
# working-directory: xla
77+
# run: |
78+
# cd ..
79+
# ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log
8080

81-
# # - name: Wait For Connection
82-
# # uses: google-ml-infra/actions/ci_connection@main
83-
# # with:
84-
# # halt-dispatch-input: ${{ inputs.halt-for-connection }}
81+
# # - name: Wait For Connection
82+
# # uses: google-ml-infra/actions/ci_connection@main
83+
# # with:
84+
# # halt-dispatch-input: ${{ inputs.halt-for-connection }}
8585

86-
# # # - name: Download parse_xla_logs.py
87-
# # # working-directory: xla
88-
# # # run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
86+
# # # - name: Download parse_xla_logs.py
87+
# # # working-directory: xla
88+
# # # run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
8989

90-
# # # - name: Parse XLA logs
91-
# # # working-directory: xla
92-
# # # run: python parse_xla_logs.py results/gpu_hlo_backend.log
90+
# # # - name: Parse XLA logs
91+
# # # working-directory: xla
92+
# # # run: python parse_xla_logs.py results/gpu_hlo_backend.log
9393

94-
# - name: Upload Results
95-
# uses: actions/upload-artifact@v4
96-
# with:
97-
# name: gpu-xla-benchmarks
98-
# path: xla/results
94+
# - name: Upload Results
95+
# uses: actions/upload-artifact@v4
96+
# with:
97+
# name: gpu-xla-benchmarks
98+
# path: xla/results
9999

100100

101-
jobs:
102-
Tests:
103-
strategy:
104-
# Don't fail fast - want to see results for all builds even if one fails.
105-
fail-fast: false
106-
matrix:
107-
job_info:
108-
- os: "linux-x86-g2-48-l4-4gpu"
109-
image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
110-
pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs"
111-
# Expect more GPU types in the future.
112-
name: ${{ matrix.job_info.pretty_name }}
113-
runs-on: ${{ matrix.job_info.os }}
114-
container: ${{ matrix.job_info.container }}
115-
defaults:
116-
run:
117-
shell: bash
118-
timeout-minutes: 360
119-
steps:
120-
- name: Checkout XLA
121-
uses: actions/checkout@v4 # v4.1.1
122-
with:
123-
repository: openxla/xla
124-
- name: Wait For Connection
125-
uses: google-ml-infra/actions/ci_connection@main
126-
with:
127-
halt-dispatch-input: ${{ inputs.halt-for-connection }}
128-
- name: Print machine specs
129-
run: |
130-
nvidia-smi
131-
free -h # Memory information
132-
df -h # Disk space information
133-
uname -a # Kernel information
134-
- name: Create results directory
135-
run: mkdir results
136-
- name: Configure XLA for GPU backend
137-
run: ./configure.py --backend CUDA --nccl
138-
- name: Set TF_CPP_MAX_VLOG_LEVEL
139-
env:
140-
TF_CPP_MAX_VLOG_LEVEL: 1
141-
run: |
142-
echo "TF_CPP_MAX_VLOG_LEVEL is: $TF_CPP_MAX_VLOG_LEVEL"
143-
- name: Build hlo_runner_main
144-
run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
145-
# TODO(juliagmt): Add more performance-critical HLOs to benchmark.
146-
- name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
147-
run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
148-
- name: Upload XSpace
149-
uses: actions/upload-artifact@v4 # v4.1.1
150-
with:
151-
name: gpu-xla-benchmarks-xspace
152-
path: xla/results/xspace.pbtxt
101+
# jobs:
102+
# Tests:
103+
# strategy:
104+
# # Don't fail fast - want to see results for all builds even if one fails.
105+
# fail-fast: false
106+
# matrix:
107+
# job_info:
108+
# - os: "linux-x86-g2-48-l4-4gpu"
109+
# container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
110+
# pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs"
111+
# # Expect more GPU types in the future.
112+
# name: ${{ matrix.job_info.pretty_name }}
113+
# runs-on: ${{ matrix.job_info.os }}
114+
# container: ${{ matrix.job_info.container }}
115+
# defaults:
116+
# run:
117+
# shell: bash
118+
# timeout-minutes: 360
119+
# steps:
120+
# - name: Checkout XLA
121+
# uses: actions/checkout@v4 # v4.1.1
122+
# with:
123+
# repository: openxla/xla
124+
# - name: Wait For Connection
125+
# uses: google-ml-infra/actions/ci_connection@main
126+
# with:
127+
# halt-dispatch-input: ${{ inputs.halt-for-connection }}
128+
# - name: Print machine specs
129+
# run: |
130+
# nvidia-smi
131+
# free -h # Memory information
132+
# df -h # Disk space information
133+
# uname -a # Kernel information
134+
# - name: Create results directory
135+
# run: mkdir results
136+
# - name: Configure XLA for GPU backend
137+
# run: ./configure.py --backend CUDA --nccl
138+
# - name: Set TF_CPP_MAX_VLOG_LEVEL
139+
# env:
140+
# TF_CPP_MAX_VLOG_LEVEL: 1
141+
# run: |
142+
# echo "TF_CPP_MAX_VLOG_LEVEL is: $TF_CPP_MAX_VLOG_LEVEL"
143+
# - name: Build hlo_runner_main
144+
# run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
145+
# # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
146+
# - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
147+
# run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
148+
# - name: Upload XSpace
149+
# uses: actions/upload-artifact@v4 # v4.1.1
150+
# with:
151+
# name: gpu-xla-benchmarks-xspace
152+
# path: xla/results/xspace.pbtxt

0 commit comments

Comments
 (0)