Create a workflow to run benchmarks #181
Workflow file for this run
name: Benchmarks
on:
  pull_request:
    branches:
      - main
  workflow_dispatch: # Allows manual triggering
  schedule:
    - cron: '0 */6 * * *' # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18)
  # workflow_dispatch:
  #   inputs:
  #     halt-for-connection:
  #       description: 'Should this workflow run wait for a remote connection?'
  #       type: choice
  #       required: true
  #       default: 'no'
  #       options:
  #         - 'yes'
  #         - 'no'
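# With the workflow_dispatch trigger above, a run can also be started manually
# from the Actions tab or with the GitHub CLI, e.g. (assuming this file is
# saved as .github/workflows/benchmarks.yml):
#   gh workflow run benchmarks.yml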
# jobs:
#   build-xla-gpu-and-test:
#     runs-on: "linux-x86-g2-48-l4-4gpu" # linux-x86-n2-16 # Use a GPU-enabled runner
#     container:
#       image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
#       options: --gpus all --privileged # Might need privileged mode; use with caution
#     steps:
#       - name: Checkout XLA
#         uses: actions/checkout@v3
#         with:
#           repository: juliagmt-google/xla
#       - name: Print machine specs
#         run: |
#           lscpu
#           free -h  # Memory information
#           df -h    # Disk space information
#           uname -a # Kernel information
#       - name: Create results directory
#         working-directory: xla
#         run: mkdir results
#       # - name: Wait For Connection
#       #   uses: google-ml-infra/actions/ci_connection@main
#       #   with:
#       #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
#       - name: Configure XLA
#         working-directory: xla
#         run: |
#           cd ..
#           ls
#           ./configure.py --backend CUDA --nccl
#       - name: Set TF_CPP_MAX_VLOG_LEVEL
#         working-directory: xla
#         run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
#       - name: Build hlo_runner_main
#         working-directory: xla
#         run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
#       # - name: Wait For Connection
#       #   uses: google-ml-infra/actions/ci_connection@main
#       #   with:
#       #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
#       - name: Run an HLO file
#         working-directory: xla
#         run: |
#           cd ..
#           ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo &> xla/results/gpu_hlo_backend.log
#       # - name: Wait For Connection
#       #   uses: google-ml-infra/actions/ci_connection@main
#       #   with:
#       #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
#       # - name: Download parse_xla_logs.py
#       #   working-directory: xla
#       #   run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
#       # - name: Parse XLA logs
#       #   working-directory: xla
#       #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
#       - name: Upload Results
#         uses: actions/upload-artifact@v4
#         with:
#           name: gpu-xla-benchmarks
#           path: xla/results
jobs:
  Tests:
    strategy:
      # Don't fail fast - want to see results for all builds even if one fails.
      fail-fast: false
      matrix:
        job_info: [
          {
            os: "linux-x86-g2-48-l4-4gpu",
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest",
            pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs",
          },
          # Expect more GPU types in the future.
        ]
    name: ${{ matrix.job_info.pretty_name }}
    runs-on: ${{ matrix.job_info.os }}
    container: ${{ matrix.job_info.container }}
    defaults:
      run:
        shell: bash
    timeout-minutes: 360
    steps:
      - name: Checkout XLA
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      - name: Print machine specs
        run: |
          nvidia-smi
          free -h  # Memory information
          df -h    # Disk space information
          uname -a # Kernel information
      - name: Create results directory
        run: mkdir results
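      # configure.py writes the Bazel configuration for a CUDA (and NCCL)
      # build, which the build step below picks up.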
      - name: Configure XLA for GPU backend
        run: |
          ./configure.py --backend CUDA --nccl
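      # TF_CPP_MAX_VLOG_LEVEL=1 raises the maximum VLOG verbosity so the
      # benchmark run below emits more detailed XLA logging.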
      - name: Set TF_CPP_MAX_VLOG_LEVEL
        run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> "$GITHUB_ENV" # Use GITHUB_ENV to persist across steps
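      # -c opt builds with optimizations, --config=cuda selects the CUDA
      # toolchain from the generated configuration, and --dynamic_mode=off
      # links C++ dependencies statically.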
      - name: Build hlo_runner_main
        run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
      # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
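      # hlo_runner_main replays the given HLO module on GPU;
      # --use_spmd_partitioning enables SPMD partitioning, and
      # --xla_gpu_dump_xspace_to writes the profiler XSpace proto
      # that the upload step below publishes as an artifact.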
      - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
        run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
      - name: Upload XSpace
        # Note: the previous pin reused the actions/checkout v4.1.1 commit SHA,
        # which does not exist in the upload-artifact repository.
        uses: actions/upload-artifact@v4
        with:
          name: gpu-xla-benchmarks-xspace
          path: results/xspace.pbtxt
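# The uploaded artifact can be fetched locally for inspection with the
# GitHub CLI, e.g.:
#   gh run download <run-id> --name gpu-xla-benchmarks-xspace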