Create a workflow to run benchmarks #231
Workflow file for this run
name: Benchmarks
on:
  pull_request:
    branches:
      - main
  # workflow_dispatch: # Allows manual triggering
  # schedule:
  #   - cron: '0 */6 * * *' # Run every 6 hours (at minute 0 of hours 0, 6, 12, 18)
  workflow_dispatch:
    inputs:
      halt-for-connection:
        description: 'Should this workflow run wait for a remote connection?'
        type: choice
        required: true
        default: 'no'
        options:
          - 'yes'
          - 'no'
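  # A minimal sketch of triggering this dispatch manually with the GitHub CLI
  # (assumes this file is saved as .github/workflows/benchmarks.yml; the
  # filename is hypothetical):
  #   gh workflow run benchmarks.yml -f halt-for-connection=yes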
jobs:
  build-xla-gpu-and-test:
    runs-on: "linux-x86-g2-48-l4-4gpu" # Use a GPU-enabled runner (alternative: linux-x86-n2-16)
    container:
      image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
      options: --gpus all --privileged # Might need privileged mode, use with caution
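    # --gpus all exposes every host GPU to the container. A quick sanity check
    # could be added as a first step (a sketch; nvidia-smi is already used in
    # the commented-out steps below):
    #   - name: Verify GPU visibility
    #     run: nvidia-smi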
    steps:
      - name: Checkout XLA
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
        with:
          repository: openxla/xla
      # - name: Checkout repository
      #   uses: actions/checkout@v3
      #   with:
      #     repository: juliagmt-google/xla
      #     path: xla
      # - name: Print machine specs
      #   run: |
      #     nvidia-smi
      #     free -h # Memory information
      #     df -h # Disk space information
      #     uname -a # Kernel information
      # - name: Set WORKSPACE_DIR
      #   env:
      #     WORKSPACE_DIR: ${{ github.workspace }}
      #   run: |
      #     echo "WORKSPACE_DIR is: $WORKSPACE_DIR"
      # - name: Create results directory
      #   run: |
      #     mkdir -p results
      #     ls
      # - name: Configure XLA for GPU backend
      #   run: |
      #     cd xla
      #     ./configure.py --backend CUDA --nccl
      # - name: Set TF_CPP_MAX_VLOG_LEVEL
      #   env:
      #     TF_CPP_MAX_VLOG_LEVEL: 1
      #   run: |
      #     echo "TF_CPP_MAX_VLOG_LEVEL is: $TF_CPP_MAX_VLOG_LEVEL"
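      # The next step pauses the job so a developer can attach to the runner for
      # interactive debugging. It is gated on the halt-for-connection dispatch
      # input; on pull_request-triggered runs that input is empty, so the job
      # presumably proceeds without halting.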
      - name: Wait For Connection
        uses: google-ml-infra/actions/ci_connection@main
        with:
          halt-dispatch-input: ${{ inputs.halt-for-connection }}
      # - name: Build hlo_runner_main
      #   run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
      # # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
      # - name: Run hlo_opt and generate xspace.pb
      #   run: |
      #     ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=$WORKSPACE_DIR/results/xspace.pb $WORKSPACE_DIR/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
      # - name: Set XSPACE_PATH
      #   env:
      #     XSPACE_PATH: ${{ github.workspace }}/results/xspace.pb
      #   run: |
      #     echo "XSPACE_PATH is: $XSPACE_PATH"
      # - name: Compute the cost of gpu_hlo_pass.hlo
      #   run: |
      #     bazel run //xla/tools:compute_cost -- --input=$WORKSPACE_DIR/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo --format=hlo --gpu
      # - name: Checkout juliagmt-google/xla
      #   uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
      #   with:
      #     repository: juliagmt-google/xla
      #     path: juliagmt-google-xla
      # - name: Compute the device stats of gpu_hlo_pass.hlo
      #   run: |
      #     echo "XSPACE_PATH is: $XSPACE_PATH"
      #     bazel run //xla/tools:get_device_stats_main -- --input=$XSPACE_PATH
      #   working-directory: juliagmt-google-xla
      # - name: Upload XSpace
      #   uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
      #   with:
      #     name: gpu-xla-benchmarks-xspace
      #     path: $WORKSPACE_DIR/results
      # - name: Print machine specs
      #   run: |
      #     lscpu
      #     free -h # Memory information
      #     df -h # Disk space information
      #     uname -a # Kernel information
      # - name: Create results directory
      #   working-directory: xla
      #   run: mkdir results
      # - name: Wait For Connection
      #   uses: google-ml-infra/actions/ci_connection@main
      #   with:
      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
      # - name: Configure XLA
      #   working-directory: xla
      #   run: |
      #     cd ..
      #     ls
      #     ./configure.py --backend CUDA --nccl
      # - name: Set TF_CPP_MAX_VLOG_LEVEL
      #   working-directory: xla
      #   run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV # Use GITHUB_ENV to persist across steps
      # - name: Build hlo_runner_main
      #   working-directory: xla
      #   run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
      # - name: Wait For Connection
      #   uses: google-ml-infra/actions/ci_connection@main
      #   with:
      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
      # - name: Run an HLO file
      #   working-directory: xla
      #   run: |
      #     cd ..
      #     ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pb xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
      # - name: Get Device Stats
      #   working-directory: xla
      #   run: |
      #     cd ..
      #     PWD=$(pwd)
      #     bazel run //xla/tools:get_device_stats_main -- --input=$PWD/xla/results/xspace.pb
      # - name: Wait For Connection
      #   uses: google-ml-infra/actions/ci_connection@main
      #   with:
      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
      # - name: Download parse_xla_logs.py
      #   working-directory: xla
      #   run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
      # - name: Parse XLA logs
      #   working-directory: xla
      #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
      # - name: Upload Results
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: gpu-xla-benchmarks
      #     path: xla/results
# jobs:
#   Tests:
#     strategy:
#       # Don't fail fast - want to see results for all builds even if one fails.
#       fail-fast: false
#       matrix:
#         job_info:
#           - os: "linux-x86-g2-48-l4-4gpu"
#             container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest"
#             pretty_name: "Linux X86 runner with 4 NVIDIA L4 GPUs"
#           # Expect more GPU types in the future.
#     name: ${{ matrix.job_info.pretty_name }}
#     runs-on: ${{ matrix.job_info.os }}
#     container: ${{ matrix.job_info.container }}
#     defaults:
#       run:
#         shell: bash
#     timeout-minutes: 360
#     steps:
#       - name: Checkout XLA
#         uses: actions/checkout@v4 # v4.1.1
#         with:
#           repository: openxla/xla
#       - name: Wait For Connection
#         uses: google-ml-infra/actions/ci_connection@main
#         with:
#           halt-dispatch-input: ${{ inputs.halt-for-connection }}
#       - name: Print machine specs
#         run: |
#           nvidia-smi
#           free -h # Memory information
#           df -h # Disk space information
#           uname -a # Kernel information
#       - name: Create results directory
#         run: mkdir results
#       - name: Configure XLA for GPU backend
#         run: ./configure.py --backend CUDA --nccl
#       - name: Set TF_CPP_MAX_VLOG_LEVEL
#         env:
#           TF_CPP_MAX_VLOG_LEVEL: 1
#         run: |
#           echo "TF_CPP_MAX_VLOG_LEVEL is: $TF_CPP_MAX_VLOG_LEVEL"
#       - name: Build hlo_runner_main
#         run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
#       # TODO(juliagmt): Add more performance-critical HLOs to benchmark.
#       - name: Run xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
#         run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=xla/results/xspace.pbtxt xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
#       - name: Upload XSpace
#         uses: actions/upload-artifact@v4 # v4.1.1
#         with:
#           name: gpu-xla-benchmarks-xspace
#           path: xla/results/xspace.pbtxt
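# A rough local-reproduction sketch, built only from commands that appear above
# (assumes a CUDA-capable machine with an XLA checkout; paths are illustrative):
#   ./configure.py --backend CUDA --nccl
#   bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
#   ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu \
#     --log_output=True --use_spmd_partitioning \
#     --xla_gpu_dump_xspace_to=results/xspace.pb xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo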