Create a workflow to run benchmarks #167
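Adds a GitHub Actions workflow that builds the XLA CPU benchmark targets with Bazel and runs them on Linux x86 and ARM64 runners for every pull request to main, with a manual dispatch input that can pause the run for remote debugging. Draft GPU and JAX jobs are included but commented out.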
Workflow file for this run
name: Benchmarks

on:
  pull_request:
    branches:
      - main
  workflow_dispatch:
    inputs:
      halt-for-connection:
        description: 'Should this workflow run wait for a remote connection?'
        type: choice
        required: true
        default: 'no'
        options:
          - 'yes'
          - 'no'
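
# The halt-for-connection input is consumed by the "Wait For Connection" step
# below: dispatching the workflow manually with 'yes' pauses the run until a
# remote debugging session attaches.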
jobs:
  Tests:
    strategy:
      # Don't fail fast - want to see results for all builds even if one fails.
      fail-fast: false
      matrix:
        job_info: [
          {
            os: "linux-x86-n2-16",
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest",
            pretty_name: "XLA Linux x86 CPU",
          },
          {
            os: "linux-arm64-c4a-16",
            container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-arm64:latest",
            pretty_name: "XLA Linux ARM64 CPU",
          }
        ]
    name: ${{ matrix.job_info.pretty_name }}
    runs-on: ${{ matrix.job_info.os }}
    container: ${{ matrix.job_info.container }}
    defaults:
      run:
        shell: bash
    timeout-minutes: 30
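    # Each matrix entry runs the full step list on its own runner; name,
    # runs-on, and container all come from the job_info entry, so adding a
    # platform only requires a new matrix element.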
    steps:
      - name: Print machine specs
        run: |
          lscpu
          free -h   # Memory information
          df -h     # Disk space information
          uname -a  # Kernel information
      - name: Wait For Connection
        uses: google-ml-infra/actions/ci_connection@main
        with:
          halt-dispatch-input: ${{ inputs.halt-for-connection }}
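      # On pull_request events the inputs context is empty, so
      # halt-dispatch-input resolves to '' and the action is expected to
      # proceed without waiting (assumption about the action's default).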
      # - name: Check Python Version in Container
      #   run: python3 --version
      # - name: Install Python 3.10 if not present (IN CONTAINER)
      #   run: |
      #     if ! python3 --version > /dev/null 2>&1; then  # check for python3
      #       echo "Python 3 not found, installing..."
      #       apt-get update
      #       apt-get install -y python3.10 python3-pip
      #     else
      #       echo "Python 3 found."
      #     fi
      - name: Checkout OpenXLA
        uses: actions/checkout@v3
        with:
          repository: openxla/xla
          # Check out into ./xla so the `working-directory: xla` steps below
          # (and the xla/results artifact path) resolve; without an explicit
          # path the repo would land in the workspace root.
          path: xla
      - name: Create results directory
        working-directory: xla
        run: mkdir results
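      # Per-benchmark logs are written into this directory and uploaded as an
      # artifact at the end of the job.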
      # - name: Run setup.sh for E2E benchmarks flax_2b
      #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
      #   timeout-minutes: 60
      #   run: |
      #     bash setup.sh
      # - name: Run run.sh for E2E benchmarks flax_2b
      #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
      #   timeout-minutes: 30
      #   run: |
      #     bash run.sh | tee -a ../../../../../../results/flax_2b.log
      # TODO(juliagmt): Re-enable once the build is fixed.
      # - name: Build run_hlo_module
      #   working-directory: xla
      #   run: bazel build -c opt --dynamic_mode=off //xla/tools:run_hlo_module
      # - name: Run HLO Module Benchmarks
      #   working-directory: xla
      #   continue-on-error: true
      #   run: |
      #     for file in tests/fuzz/*.hlo; do
      #       filename=$(basename "$file")
      #       # Skip expected failed hlo files.
      #       if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
      #         echo "Skipping benchmark on $file"
      #         continue
      #       fi
      #       echo "Running benchmark on $file"
      #       ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=CPU "$file"
      #     done
      - name: Build CPU Benchmarks
        working-directory: xla
        run: bazel build -c opt --dynamic_mode=off //xla/backends/cpu/benchmarks:*
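      # -c opt builds with optimizations, --dynamic_mode=off links the
      # benchmark binaries statically, and the :* wildcard covers every
      # target in the benchmarks package.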
      - name: Run CPU benchmarks
        working-directory: xla
        continue-on-error: true
        run: |
          find ./bazel-bin/xla/backends/cpu/benchmarks/ -maxdepth 1 -type f -executable -name "*_test" -print0 | while IFS= read -r -d $'\0' benchmark; do
            benchmark_name=$(basename "$benchmark" | sed 's/_test$//')
            echo "Running benchmark: $benchmark_name"
            # Run the benchmark once with default parameters, capturing stdout
            # and stderr in the log and checking the exit code.
            if "$benchmark" --benchmark_filter=".*" > "results/$benchmark_name.log" 2>&1; then
              echo "Benchmark '$benchmark_name' completed successfully."
            else
              echo "Error: Benchmark '$benchmark_name' failed. Check the log file: results/$benchmark_name.log"
            fi
          done
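      # To reproduce a single benchmark locally, the binaries accept standard
      # Google Benchmark flags, e.g. (hypothetical target name):
      #   bazel run -c opt //xla/backends/cpu/benchmarks:dot_benchmark_test -- --benchmark_filter=".*"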
      # The matrix only defines job_info, so use its pretty_name here
      # (matrix.build_target is undefined and would render empty).
      - name: Upload Results ${{ matrix.job_info.pretty_name }}
        uses: actions/upload-artifact@v4
        with:
          name: cpu-xla-benchmarks-${{ matrix.job_info.os }}
          path: xla/results
  # build-xla-gpu-and-test:
  #   runs-on: "linux-x86-n2-16"  # linux-x86-g2-48-l4-4gpu  # Use a GPU-enabled runner
  #   container:
  #     image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
  #     options: --gpus all --privileged  # Might need privileged mode, use with caution
  #   steps:
  #     - name: Checkout XLA
  #       uses: actions/checkout@v3
  #       with:
  #         repository: openxla/xla  # Replace with your fork if needed
  #     - name: Checkout repository
  #       uses: actions/checkout@v3
  #       with:
  #         repository: juliagmt-google/xla
  #         ref: dev
  #     - name: Print machine specs
  #       run: |
  #         lscpu
  #         free -h   # Memory information
  #         df -h     # Disk space information
  #         uname -a  # Kernel information
  #     - name: Create results directory
  #       working-directory: xla
  #       run: mkdir results
  #     - name: Wait For Connection
  #       uses: google-ml-infra/actions/ci_connection@main
  #       with:
  #         halt-dispatch-input: ${{ inputs.halt-for-connection }}
  #     - name: Set up Python 3.10  # Choose your desired Python version
  #       uses: actions/setup-python@v4
  #       with:
  #         python-version: '3.10'
  #     # - name: Create and activate virtual environment
  #     #   shell: bash  # Force the use of bash
  #     #   run: |
  #     #     python -m venv xla/venv
  #     #     source xla/venv/bin/activate
  #     # - name: Set up Python 3.10
  #     #   uses: actions/setup-python@v4
  #     #   with:
  #     #     python-version: '3.10'
  #     # - name: Run setup.sh for E2E benchmarks flax_2b (within venv)
  #     #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
  #     #   shell: bash
  #     #   run: |
  #     #     bash setup.sh
  #     # - name: Wait For Connection
  #     #   uses: google-ml-infra/actions/ci_connection@main
  #     #   with:
  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
  #     # - name: Run run.sh for E2E benchmarks flax_2b (within venv)
  #     #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
  #     #   timeout-minutes: 30
  #     #   shell: bash
  #     #   run: |
  #     #     bash run.sh > ../../../../../../results/flax_2b.log
  #     # - name: Build run_hlo_module
  #     #   working-directory: xla
  #     #   run: bazel build -c opt --dynamic_mode=off //xla/tools:run_hlo_module
  #     # - name: Run HLO Module Benchmarks
  #     #   working-directory: xla
  #     #   continue-on-error: true
  #     #   run: |
  #     #     for file in xla/tests/fuzz/*.hlo; do
  #     #       filename=$(basename "$file")
  #     #       # Skip expected failed hlo files.
  #     #       if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
  #     #         echo "Skipping benchmark on $file"
  #     #         continue
  #     #       fi
  #     #       echo "Running benchmark on $file"
  #     #       ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=CPU "$file"
  #     #     done
  #     # - name: Build CPU Benchmarks
  #     #   working-directory: xla
  #     #   run: bazel build -c opt --dynamic_mode=off //xla/backends/cpu/benchmarks:*
  #     # - name: Run CPU benchmarks
  #     #   working-directory: xla
  #     #   continue-on-error: true
  #     #   run: |
  #     #     find ./bazel-bin/xla/backends/cpu/benchmarks/ -maxdepth 1 -type f -executable -name "*_test" -print0 | while IFS= read -r -d $'\0' benchmark; do
  #     #       benchmark_name=$(basename "$benchmark" | sed 's/_test$//')
  #     #       echo "Running benchmark: $benchmark_name"
  #     #       # Run the benchmark with default parameters.
  #     #       $benchmark --benchmark_filter=".*"
  #     #       $benchmark --benchmark_filter=".*" > "results/$benchmark_name.log" 2>&1
  #     #       # Check the exit code of the benchmark
  #     #       if [ $? -ne 0 ]; then
  #     #         echo "Error: Benchmark '$benchmark_name' failed. Check the log file: results/$benchmark_name.log"
  #     #       else
  #     #         echo "Benchmark '$benchmark_name' completed successfully."
  #     #       fi
  #     #     done
  #     # - name: Wait For Connection
  #     #   uses: google-ml-infra/actions/ci_connection@main
  #     #   with:
  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
  #     # - name: Get GPU spec
  #     #   working-directory: xla
  #     #   continue-on-error: true
  #     #   run: nvidia-smi
  #     # - name: Configure XLA
  #     #   working-directory: xla
  #     #   run: ./configure.py --backend CUDA --nccl
  #     # - name: Set TF_CPP_MAX_VLOG_LEVEL
  #     #   working-directory: xla
  #     #   run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV  # Use GITHUB_ENV to persist across steps
  #     # - name: Check TF_CPP_MAX_VLOG_LEVEL
  #     #   working-directory: xla
  #     #   run: echo "$TF_CPP_MAX_VLOG_LEVEL"
  #     # - name: Build hlo_runner_main
  #     #   working-directory: xla
  #     #   run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
  #     # - name: Wait For Connection
  #     #   uses: google-ml-infra/actions/ci_connection@main
  #     #   with:
  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
  #     # - name: Create gpu_hlo_backend.hlo
  #     #   working-directory: xla
  #     #   run: |
  #     #     cat << EOF > gpu_hlo_backend.hlo
  #     #     HloModule module
  #     #     // CHECK: is_scheduled=true
  #     #     ENTRY computation {
  #     #       p = f32[5000,6000]{1,0} parameter(0)
  #     #       e = f32[5000,6000]{1,0} sqrt(p)
  #     #       c = f32[6000,5000] transpose(p), dimensions={1,0}
  #     #       r = f32[300,20,5000] reshape(c)
  #     #       ROOT out = (f32[5000,6000], f32[300,20,5000]) tuple(e,r)
  #     #     }
  #     #     EOF
  #     # - name: Wait For Connection
  #     #   uses: google-ml-infra/actions/ci_connection@main
  #     #   with:
  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
  #     # - name: Run an HLO file
  #     #   working-directory: xla
  #     #   run: |
  #     #     ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning gpu_hlo_backend.hlo &> results/gpu_hlo_backend.log
  #     # - name: Wait For Connection
  #     #   uses: google-ml-infra/actions/ci_connection@main
  #     #   with:
  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
  #     # - name: Download parse_xla_logs.py
  #     #   working-directory: xla
  #     #   run: wget https://raw.githubusercontent.com/juliagmt-google/xla/main/.github/workflows/parse_xla_logs.py
  #     # - name: Parse XLA logs
  #     #   working-directory: xla
  #     #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
  #     - name: Upload Results
  #       uses: actions/upload-artifact@v4
  #       with:
  #         name: gpu-xla-benchmarks
  #         path: xla/results
  # # # jax-build-and-test:
  # # #   runs-on: linux-x86-g2-48-l4-4gpu  # Use a GPU-enabled runner
  # # #   container:
  # # #     image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
  # # #   env:
  # # #     JAXCI_HERMETIC_PYTHON_VERSION: 3.11
  # # #   steps:
  # # #     - name: Checkout JAX Fork
  # # #       uses: actions/checkout@v3
  # # #       with:
  # # #         repository: 'google-ml-infra/jax-fork'
  # # #         path: jax-fork
  # # #     - name: Install JAX Dependencies
  # # #       working-directory: jax-fork
  # # #       run: |
  # # #         python -m pip install --upgrade pip
  # # #         pip install pytest
  # # #         pip install absl-py
  # # #         pip install "jax[cuda12_pip]"  # Adjust CUDA version if needed
  # # #         pip install google-benchmark
  # # #     - name: Run JAX Multiprocess GPU Test
  # # #       working-directory: jax-fork
  # # #       continue-on-error: true
  # # #       run: python -m pytest tests/multiprocess_gpu_test.py
  # # #     - name: Run HLO Module Benchmarks with GPU in xla/tests/fuzz
  # # #       working-directory: xla
  # # #       continue-on-error: true
  # # #       run: |
  # # #         for file in xla/tests/fuzz/*.hlo; do
  # # #           filename=$(basename "$file")
  # # #           # Skip expected failed hlo files.
  # # #           if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
  # # #             echo "Skipping benchmark on $file"
  # # #             continue
  # # #           fi
  # # #           echo "Running benchmark on $file"
  # # #           ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning "$file" &> results/"$filename".log
  # # #         done
  # # #     - name: Upload Results
  # # #       uses: actions/upload-artifact@v4
  # # #       with:
  # # #         name: gpu-xla-benchmarks
  # # #         path: xla/results