|
7 | | - workflow_dispatch: # Allows manual triggering
8 | 8 | schedule: |
9 | 9 | - cron: '0 */6 * * *' # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18) |
10 | | - # workflow_dispatch: |
11 | | - # inputs: |
12 | | - # halt-for-connection: |
13 | | - # description: 'Should this workflow run wait for a remote connection?' |
14 | | - # type: choice |
15 | | - # required: true |
16 | | - # default: 'no' |
17 | | - # options: |
18 | | - # - 'yes' |
19 | | - # - 'no' |
| 10 | + workflow_dispatch: # Allows manual triggering
| 11 | + inputs: |
| 12 | + halt-for-connection: |
| 13 | + description: 'Should this workflow run wait for a remote connection?' |
| 14 | + type: choice |
| 15 | + required: true |
| 16 | + default: 'no' |
| 17 | + options: |
| 18 | + - 'yes' |
| 19 | + - 'no' |
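| | +      # 'yes' and 'no' are quoted so the YAML parser keeps them as strings (unquoted yes/no can be read as booleans).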
20 | 20 |
|
21 | | -# jobs: |
22 | | -# build-xla-gpu-and-test: |
23 | | -# runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner |
24 | | -# container: |
25 | | -# image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest" |
26 | | -# options: --gpus all --privileged # Might need privileged mode, use with caution |
| 21 | +jobs: |
| 22 | + build-xla-gpu-and-test: |
| 23 | + runs-on: "linux-x86-g2-48-l4-4gpu" # GPU-enabled runner (CPU-only alternative: linux-x86-n2-16)
| 24 | + container: |
| 25 | + image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest" |
| 26 | + options: --gpus all --privileged # Privileged mode may be required; use with caution
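| | +      # '--gpus all' makes every GPU on the runner (4x NVIDIA L4 here) visible inside the container.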
27 | 27 |
|
28 | | -# steps: |
29 | | -# - name: Checkout XLA |
30 | | -# uses: actions/checkout@v3 |
31 | | -# with: |
32 | | -# repository: juliagmt-google/xla |
| 28 | + steps: |
| 29 | + - name: Checkout XLA |
| 30 | + uses: actions/checkout@v3 |
| 31 | + with: |
| 32 | + repository: openxla/xla |
33 | 33 |
|
34 | | -# - name: Print machine specs |
35 | | -# run: | |
36 | | -# lscpu |
37 | | -# free -h # Memory information |
38 | | -# df -h # Disk space information |
39 | | -# uname -a # Kernel information |
| 34 | + - name: Print machine specs |
| 35 | + run: | |
| 36 | + lscpu # CPU information
| 37 | + free -h # Memory information |
| 38 | + df -h # Disk space information |
| 39 | + uname -a # Kernel information |
40 | 40 | |
41 | | -# - name: Create results directory |
42 | | -# working-directory: xla |
43 | | -# run: mkdir results |
| 41 | + - name: Create results directory |
| 42 | + working-directory: xla |
| 43 | + run: mkdir results |
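| | +      # results/ is created inside the repo's xla/ directory; the (currently commented-out) run steps below write their logs and XSpace dump there.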
44 | 44 |
|
45 | | -# # - name: Wait For Connection |
46 | | -# # uses: google-ml-infra/actions/ci_connection@main |
47 | | -# # with: |
48 | | -# # halt-dispatch-input: ${{ inputs.halt-for-connection }} |
| 45 | + - name: Wait For Connection |
| 46 | + uses: google-ml-infra/actions/ci_connection@main |
| 47 | + with: |
| 48 | + halt-dispatch-input: ${{ inputs.halt-for-connection }} |
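| | +      # Pauses the workflow for an interactive remote connection when a manual dispatch sets halt-for-connection to 'yes'.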
49 | 49 |
|
50 | | -# - name: Configure XLA |
51 | | -# working-directory: xla |
52 | | -# run: | |
53 | | -# cd .. |
54 | | -# ls |
55 | | -# ./configure.py --backend CUDA --nccl |
| 50 | + # - name: Configure XLA |
| 51 | + # working-directory: xla |
| 52 | + # run: | |
| 53 | + # cd .. |
| 54 | + # ls |
| 55 | + # ./configure.py --backend CUDA --nccl |
56 | 56 |
|
57 | | -# - name: Set TF_CPP_MAX_VLOG_LEVEL |
58 | | -# working-directory: xla |
59 | | -# run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps |
| 57 | + # - name: Set TF_CPP_MAX_VLOG_LEVEL |
| 58 | + # working-directory: xla |
| 59 | + # run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps |
60 | 60 |
|
61 | | -# - name: Build hlo_runner_main |
62 | | -# working-directory: xla |
63 | | -# run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main |
| 61 | + # - name: Build hlo_runner_main |
| 62 | + # working-directory: xla |
| 63 | + # run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main |
64 | 64 |
|
65 | | -# # - name: Wait For Connection |
66 | | -# # uses: google-ml-infra/actions/ci_connection@main |
67 | | -# # with: |
68 | | -# # halt-dispatch-input: ${{ inputs.halt-for-connection }} |
| 65 | + # # - name: Wait For Connection |
| 66 | + # # uses: google-ml-infra/actions/ci_connection@main |
| 67 | + # # with: |
| 68 | + # # halt-dispatch-input: ${{ inputs.halt-for-connection }} |
69 | 69 |
74 | 74 |
|
75 | | -# - name: Run an HLO file |
76 | | -# working-directory: xla |
77 | | -# run: | |
78 | | -# cd .. |
79 | | -# ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log |
| 75 | + # - name: Run an HLO file |
| 76 | + # working-directory: xla |
| 77 | + # run: | |
| 78 | + # cd .. |
| 79 | + # ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log |
80 | 80 |
|
81 | | -# # - name: Wait For Connection |
82 | | -# # uses: google-ml-infra/actions/ci_connection@main |
83 | | -# # with: |
84 | | -# # halt-dispatch-input: ${{ inputs.halt-for-connection }} |
| 81 | + # # - name: Wait For Connection |
| 82 | + # # uses: google-ml-infra/actions/ci_connection@main |
| 83 | + # # with: |
| 84 | + # # halt-dispatch-input: ${{ inputs.halt-for-connection }} |
85 | 85 |
|
86 | | -# # # - name: Download parse_xla_logs.py |
87 | | -# # # working-directory: xla |
88 | | -# # # run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py |
| 86 | + # # # - name: Download parse_xla_logs.py |
| 87 | + # # # working-directory: xla |
| 88 | + # # # run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py |
89 | 89 |
|
90 | | -# # # - name: Parse XLA logs |
91 | | -# # # working-directory: xla |
92 | | -# # # run: python parse_xla_logs.py results/gpu_hlo_backend.log |
| 90 | + # # # - name: Parse XLA logs |
| 91 | + # # # working-directory: xla |
| 92 | + # # # run: python parse_xla_logs.py results/gpu_hlo_backend.log |
93 | 93 |
|
94 | | -# - name: Upload Results |
95 | | -# uses: actions/upload-artifact@v4 |
96 | | -# with: |
97 | | -# name: gpu-xla-benchmarks |
98 | | -# path: xla/results |
| 94 | + # - name: Upload Results |
| 95 | + # uses: actions/upload-artifact@v4 |
| 96 | + # with: |
| 97 | + # name: gpu-xla-benchmarks |
| 98 | + # path: xla/results |
99 | 99 |
|
100 | 100 |
|
101 | | -jobs: |
102 | | - Tests: |
103 | | - strategy: |
104 | | - # Don't fail fast - want to see results for all builds even if one fails. |
105 | | - fail-fast: false |
106 | | - matrix: |
107 | | - job_info: |
108 | | - - os: "linux-x86-g2-48-l4-4gpu" |
109 | | - image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest" |
110 | | - pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs" |
111 | | - # Expect more GPU types in the future. |
112 | | - name: ${{ matrix.job_info.pretty_name }} |
113 | | - runs-on: ${{ matrix.job_info.os }} |
114 | | - container: ${{ matrix.job_info.container }} |
115 | | - defaults: |
116 | | - run: |
117 | | - shell: bash |
118 | | - timeout-minutes: 360 |
119 | | - steps: |
120 | | - - name: Checkout XLA |
121 | | - uses: actions/checkout@v4 # v4.1.1 |
122 | | - with: |
123 | | - repository: openxla/xla |
124 | | - - name: Wait For Connection |
125 | | - uses: google-ml-infra/actions/ci_connection@main |
126 | | - with: |
127 | | - halt-dispatch-input: ${{ inputs.halt-for-connection }} |
128 | | - - name: Print machine specs |
129 | | - run: | |
130 | | - nvidia-smi |
131 | | - free -h # Memory information |
132 | | - df -h # Disk space information |
133 | | - uname -a # Kernel information |
134 | | - - name: Create results directory |
135 | | - run: mkdir results |
136 | | - - name: Configure XLA for GPU backend |
137 | | - run: ./configure.py --backend CUDA --nccl |
138 | | - - name: Set TF_CPP_MAX_VLOG_LEVEL |
139 | | - env: |
140 | | - TF_CPP_MAX_VLOG_LEVEL: 1 |
141 | | - run: | |
142 | | - echo "TF_CPP_MAX_VLOG_LEVEL is: $TF_CPP_MAX_VLOG_LEVEL" |
143 | | - - name: Build hlo_runner_main |
144 | | - run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main |
145 | | - # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
146 | | - - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo |
147 | | - run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo |
148 | | - - name: Upload XSpace |
149 | | - uses: actions/upload-artifact@v4 # v4.1.1 |
150 | | - with: |
151 | | - name: gpu-xla-benchmarks-xspace |
152 | | - path: xla/results/xspace.pbtxt |
| 101 | +# jobs: |
| 102 | +# Tests: |
| 103 | +# strategy: |
| 104 | +# # Don't fail fast - want to see results for all builds even if one fails. |
| 105 | +# fail-fast: false |
| 106 | +# matrix: |
| 107 | +# job_info: |
| 108 | +# - os: "linux-x86-g2-48-l4-4gpu" |
| 109 | +# container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
| 110 | +# pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs" |
| 111 | +# # Expect more GPU types in the future. |
| 112 | +# name: ${{ matrix.job_info.pretty_name }} |
| 113 | +# runs-on: ${{ matrix.job_info.os }} |
| 114 | +# container: ${{ matrix.job_info.container }} |
| 115 | +# defaults: |
| 116 | +# run: |
| 117 | +# shell: bash |
| 118 | +# timeout-minutes: 360 |
| 119 | +# steps: |
| 120 | +# - name: Checkout XLA |
| 121 | +# uses: actions/checkout@v4 # v4.1.1 |
| 122 | +# with: |
| 123 | +# repository: openxla/xla |
| 124 | +# - name: Wait For Connection |
| 125 | +# uses: google-ml-infra/actions/ci_connection@main |
| 126 | +# with: |
| 127 | +# halt-dispatch-input: ${{ inputs.halt-for-connection }} |
| 128 | +# - name: Print machine specs |
| 129 | +# run: | |
| 130 | +# nvidia-smi |
| 131 | +# free -h # Memory information |
| 132 | +# df -h # Disk space information |
| 133 | +# uname -a # Kernel information |
| 134 | +# - name: Create results directory |
| 135 | +# run: mkdir results |
| 136 | +# - name: Configure XLA for GPU backend |
| 137 | +# run: ./configure.py --backend CUDA --nccl |
| 138 | +# - name: Set TF_CPP_MAX_VLOG_LEVEL |
| 139 | +# env: |
| 140 | +# TF_CPP_MAX_VLOG_LEVEL: 1 |
| 141 | +# run: | |
| 142 | +# echo "TF_CPP_MAX_VLOG_LEVEL is: $TF_CPP_MAX_VLOG_LEVEL" |
| 143 | +# - name: Build hlo_runner_main |
| 144 | +# run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main |
| 145 | +# # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
| 146 | +# - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo |
| 147 | +# run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo |
| 148 | +# - name: Upload XSpace |
| 149 | +# uses: actions/upload-artifact@v4 # v4.1.1 |
| 150 | +# with: |
| 151 | +# name: gpu-xla-benchmarks-xspace |
| 152 | +# path: xla/results/xspace.pbtxt |