|
 # - 'yes'
 # - 'no'
 
-jobs:
-  build-xla-gpu-and-test:
-    runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner
-    container:
-      image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
-      options: --gpus all --privileged # Might need privileged mode, use with caution
+# jobs:
+#   build-xla-gpu-and-test:
+#     runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner
+#     container:
+#       image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
+#       options: --gpus all --privileged # Might need privileged mode, use with caution
 
+#     steps:
+#       - name: Checkout XLA
+#         uses: actions/checkout@v3
+#         with:
+#           repository: juliagmt-google/xla
+
+#       - name: Print machine specs
+#         run: |
+#           lscpu
+#           free -h # Memory information
+#           df -h # Disk space information
+#           uname -a # Kernel information
+
+#       - name: Create results directory
+#         working-directory: xla
+#         run: mkdir results
+
+#       # - name: Wait For Connection
+#       #   uses: google-ml-infra/actions/ci_connection@main
+#       #   with:
+#       #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#       - name: Configure XLA
+#         working-directory: xla
+#         run: |
+#           cd ..
+#           ls
+#           ./configure.py --backend CUDA --nccl
+
+#       - name: Set TF_CPP_MAX_VLOG_LEVEL
+#         working-directory: xla
+#         run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
+
+#       - name: Build hlo_runner_main
+#         working-directory: xla
+#         run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
+
+#       # - name: Wait For Connection
+#       #   uses: google-ml-infra/actions/ci_connection@main
+#       #   with:
+#       #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#       # - name: Wait For Connection
+#       #   uses: google-ml-infra/actions/ci_connection@main
+#       #   with:
+#       #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#       - name: Run an HLO file
+#         working-directory: xla
+#         run: |
+#           cd ..
+#           ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log
+
+#       # - name: Wait For Connection
+#       #   uses: google-ml-infra/actions/ci_connection@main
+#       #   with:
+#       #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+
+#       # # - name: Download parse_xla_logs.py
+#       # #   working-directory: xla
+#       # #   run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
+
+#       # # - name: Parse XLA logs
+#       # #   working-directory: xla
+#       # #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
+
+#       - name: Upload Results
+#         uses: actions/upload-artifact@v4
+#         with:
+#           name: gpu-xla-benchmarks
+#           path: xla/results
+
+
+jobs:
+  Tests:
+    strategy:
+      # Don't fail fast - want to see results for all builds even if one fails.
+      fail-fast: false
+      matrix:
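+        # Each job_info entry bundles a runner label, container image, and a
+        # display name, so adding a new GPU type is a single new entry here.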
+        job_info: [
+          {
+            os: "linux-x86-g2-48-l4-4gpu",
+            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest",
+            pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs",
+          },
+          # Expect more GPU types in the future.
+        ]
+    name: ${{ matrix.job_info.pretty_name }}
+    runs-on: ${{ matrix.job_info.os }}
+    container: ${{ matrix.job_info.container }}
+    defaults:
+      run:
+        shell: bash
+    timeout-minutes: 360
     steps:
       - name: Checkout XLA
-        uses: actions/checkout@v3
-        with:
-          repository: juliagmt-google/xla
-
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
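+        # Pinned to a full commit SHA (not a movable tag) so the action's code
+        # cannot change underneath the workflow.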
+
       - name: Print machine specs
         run: |
-          lscpu
+          nvidia-smi # GPU model, driver, and CUDA version
           free -h # Memory information
           df -h # Disk space information
           uname -a # Kernel information
-
       - name: Create results directory
-        working-directory: xla
-        run: mkdir results
+        run: mkdir -p xla/results # hlo_runner_main writes its XSpace dump here
-
-      # - name: Wait For Connection
-      #   uses: google-ml-infra/actions/ci_connection@main
-      #   with:
-      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-      - name: Configure XLA
-        working-directory: xla
+
+      - name: Configure XLA for GPU backend
         run: |
-          cd ..
-          ls
           ./configure.py --backend CUDA --nccl
-
       - name: Set TF_CPP_MAX_VLOG_LEVEL
-        working-directory: xla
-        run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
+        run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> "$GITHUB_ENV" # Use GITHUB_ENV to persist across steps
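+        # TF_CPP_MAX_VLOG_LEVEL=1 enables VLOG(1) messages from XLA's C++
+        # sources in the logs of the later steps.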
 
       - name: Build hlo_runner_main
-        working-directory: xla
         run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
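+      # -c opt builds optimized binaries, --config=cuda targets the CUDA
+      # toolchain, and --dynamic_mode=off links C++ dependencies statically.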
-
-      # - name: Wait For Connection
-      #   uses: google-ml-infra/actions/ci_connection@main
-      #   with:
-      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-      # - name: Wait For Connection
-      #   uses: google-ml-infra/actions/ci_connection@main
-      #   with:
-      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-      - name: Run an HLO file
-        working-directory: xla
-        run: |
-          cd ..
-          ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log
-
-      # - name: Wait For Connection
-      #   uses: google-ml-infra/actions/ci_connection@main
-      #   with:
-      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-      # # - name: Download parse_xla_logs.py
-      # #   working-directory: xla
-      # #   run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
+      # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
+      - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
+        run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
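+      # The dumped XSpace protobuf is the benchmark artifact uploaded below.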
 
-      # # - name: Parse XLA logs
-      # #   working-directory: xla
-      # #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
-
-      - name: Upload Results
-        uses: actions/upload-artifact@v4
+      - name: Upload XSpace
+        uses: actions/upload-artifact@v4 # TODO: pin to a full commit SHA
         with:
-          name: gpu-xla-benchmarks
-          path: xla/results
-
+          name: gpu-xla-benchmarks-xspace
+          path: xla/results/xspace.pbtxt