Update benchmarks.yml

juliagmt-google · web-flow · commit 78cddef3411c · 2025-02-14T09:17:23.000-08:00
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -15,126 +15,6 @@ on:
         - 'yes'
         - 'no'
 
-#jobs:
-  # Tests:
-  #     strategy:
-  #       # Don't fail fast - want to see results for all builds even if one fails.
-  #       fail-fast: false
-  #       matrix:
-  #         job_info: [
-  #           {
-  #             os: "linux-x86-n2-16",
-  #             container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build:latest",
-  #             pretty_name: "XLA Linux x86 CPU",
-  #           },
-  #           {
-  #             os: "linux-arm64-c4a-16",
-  #             container: "us-central1-docker.pkg.dev/tensorflow-sigs/tensorflow/ml-build-arm64:latest",
-  #             pretty_name: "XLA Linux ARM64 CPU",
-  #           }
-  #         ]
-  #     name: ${{ matrix.job_info.pretty_name }}
-  #     runs-on: ${{ matrix.job_info.os }}
-  #     container: ${{ matrix.job_info.container }}
-  #     defaults:
-  #       run:
-  #         shell: bash
-  #     timeout-minutes: 30
-  #     steps:
-  #       - name: Print machine specs
-  #         run: |
-  #           lscpu
-  #           free -h  # Memory information
-  #           df -h    # Disk space information
-  #           uname -a # Kernel information
-  #       - name: Wait For Connection
-  #         uses: google-ml-infra/actions/ci_connection@main
-  #         with:
-  #           halt-dispatch-input: ${{ inputs.halt-for-connection }}
-  
-        # - name: Check Python Version in Container
-        #   run: python3 --version
-  
-        # - name: Install Python 3.10 if not present (IN CONTAINER)
-        #   run: |
-        #     if ! python3 --version > /dev/null 2>&1; then # check for python3
-        #       echo "Python 3 not found, installing..."
-        #       apt-get update
-        #       apt-get install -y python3.10 python3-pip
-        #     else
-        #       echo "Python 3 found."
-        #     fi
-  
-        # - name: Checkout OpenXLA
-        #   uses: actions/checkout@v3
-        #   with:
-        #     repository: openxla/xla
-  
-        # - name: Create results directory
-        #   working-directory: xla
-        #   run:
-        #     mkdir results
-  
-        # - name: Run setup.sh for E2E benchmarks flax_2b
-        #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
-        #   timeout-minutes: 60
-        #   run: |
-        #     bash setup.sh
-  
-        # - name: Run run.sh for E2E benchmarks flax_2b
-        #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
-        #   timeout-minutes: 30
-        #   run: |
-        #     bash run.sh | tee -a ../../../../../../results/flax_2b.log
-  
-        # TODO(juliagmt): Re-enable once the build is fixed.
-        # - name: Build run_hlo_module
-        #   working-directory: xla
-        #   run: bazel build -c opt --dynamic_mode=off //xla/tools:run_hlo_module
-  
-        # - name: Run HLO Module Benchmarks
-        #   working-directory: xla
-        #   continue-on-error: true
-        #   run: |
-        #     for file in tests/fuzz/*.hlo; do
-        #       filename=$(basename "$file")
-        #       # Skip expected failed hlo files.
-        #       if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
-        #         echo "Skipping benchmark on $file"
-        #         continue
-        #       fi
-        #       echo "Running benchmark on $file"
-        #       ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=CPU "$file"
-        #     done
-  
-        # - name: Build CPU Benchmarks
-        #   working-directory: xla
-        #   run: bazel build -c opt --dynamic_mode=off //xla/backends/cpu/benchmarks:*
-  
-        # - name: Run CPU benchmarks
-        #   working-directory: xla
-        #   continue-on-error: true
-        #   run: |
-        #     find ./bazel-bin/xla/backends/cpu/benchmarks/ -maxdepth 1 -type f -executable -name "*_test" -print0 | while IFS= read -r -d $'\0' benchmark; do
-        #       benchmark_name=$(basename "$benchmark" | sed 's/_test$//') 
-        #       echo "Running benchmark: $benchmark_name"
-  
-        #       # Run the benchmark with default parameters.
-        #       $benchmark --benchmark_filter=".*"
-        #       $benchmark --benchmark_filter=".*" > "results/$benchmark_name.log" 2>&1
-  
-        #       # Check the exit code of the benchmark
-        #       if [ $? -ne 0 ]; then
-        #         echo "Error: Benchmark '$benchmark_name' failed. Check the log file: results/$benchmark_name.log"
-        #       else
-        #         echo "Benchmark '$benchmark_name' completed successfully."
-        #       fi
-        #     done
-        # - name: Upload Results ${{ matrix.build_target }}
-        #   uses: actions/upload-artifact@v4
-        #   with:
-        #     name: cpu-xla-benchmarks-${{ matrix.job_info.os }}
-        #     path: xla/results
 jobs:
   build-xla-gpu-and-test:
     runs-on: "linux-x86-g2-48-l4-4gpu" #linux-x86-n2-16 # Use a GPU-enabled runner 
@@ -146,13 +26,8 @@ jobs:
       - name: Checkout XLA
         uses: actions/checkout@v3
         with:
-          repository: openxla/xla  # Replace with your fork if needed
-      - name: Checkout repository
-        uses: actions/checkout@v3
-        with:
-          repository: juliagmt-google/xla  
-          ref: dev
-     
+          repository: openxla/xla
+
       - name: Print machine specs
         run: |
           lscpu
@@ -164,147 +39,58 @@ jobs:
         working-directory: xla
         run: mkdir results
           
-      - name: Wait For Connection
-        uses: google-ml-infra/actions/ci_connection@main
-        with:
-          halt-dispatch-input: ${{ inputs.halt-for-connection }}
-      # - name: Set up Python 3.10  # Choose your desired Python version
-      #   uses: actions/setup-python@v4
+      # - name: Wait For Connection
+      #   uses: google-ml-infra/actions/ci_connection@main
       #   with:
-      #     python-version: '3.10'
-
-  #     # - name: Create and activate virtual environment
-  #     #   shell: bash  # Force the use of bash
-  #     #   run: |
-  #     #     python -m venv xla/venv
-  #     #     source xla/venv/bin/activate
-  #     # - name: Set up Python 3.10
-  #     #   uses: actions/setup-python@v4
-  #     #   with:
-  #     #     python-version: '3.10'
-
-  #     # - name: Run setup.sh for E2E benchmarks flax_2b (within venv)
-  #     #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
-  #     #   shell: bash
-  #     #   run: |
-  #     #     bash setup.sh
-          
-  #     # - name: Wait For Connection
-  #     #   uses: google-ml-infra/actions/ci_connection@main
-  #     #   with:
-  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-
-  #     # - name: Run run.sh for E2E benchmarks flax_2b (within venv)
-  #     #   working-directory: xla/backends/cpu/benchmarks/e2e/gemma2/flax_2b
-  #     #   timeout-minutes: 30
-  #     #   shell: bash
-  #     #   run: |
-  #     #     bash run.sh > ../../../../../../results/flax_2b.log
-
-  #     # - name: Build run_hlo_module
-  #     #   working-directory: xla
-  #     #   run: bazel build -c opt --dynamic_mode=off //xla/tools:run_hlo_module
-
-  #     # - name: Run HLO Module Benchmarks
-  #     #   working-directory: xla
-  #     #   continue-on-error: true
-  #     #   run: |
-  #     #     for file in xla/tests/fuzz/*.hlo; do
-  #     #       filename=$(basename "$file")
-  #     #       # Skip expected failed hlo files.
-  #     #       if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
-  #     #         echo "Skipping benchmark on $file"
-  #     #         continue
-  #     #       fi
-  #     #       echo "Running benchmark on $file"
-  #     #       ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=CPU "$file"
-  #     #     done
-
-  #     # - name: Build CPU Benchmarks
-  #     #   working-directory: xla
-  #     #   run: bazel build -c opt --dynamic_mode=off //xla/backends/cpu/benchmarks:*
-
-  #     # - name: Run CPU benchmarks
-  #     #   working-directory: xla
-  #     #   continue-on-error: true
-  #     #   run: |
-  #     #     find ./bazel-bin/xla/backends/cpu/benchmarks/ -maxdepth 1 -type f -executable -name "*_test" -print0 | while IFS= read -r -d $'\0' benchmark; do
-  #     #       benchmark_name=$(basename "$benchmark" | sed 's/_test$//') 
-  #     #       echo "Running benchmark: $benchmark_name"
-
-  #     #       # Run the benchmark with default parameters.
-  #     #       $benchmark --benchmark_filter=".*"
-  #     #       $benchmark --benchmark_filter=".*" > "results/$benchmark_name.log" 2>&1
-
-  #     #       # Check the exit code of the benchmark
-  #     #       if [ $? -ne 0 ]; then
-  #     #         echo "Error: Benchmark '$benchmark_name' failed. Check the log file: results/$benchmark_name.log"
-  #     #       else
-  #     #         echo "Benchmark '$benchmark_name' completed successfully."
-  #     #       fi
-  #     #     done
-
-
-  #     # - name: Wait For Connection
-  #     #   uses: google-ml-infra/actions/ci_connection@main
-  #     #   with:
-  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
-  #     # - name: Get GPU spec
-  #     #   working-directory: xla
-  #     #   continue-on-error: true
-  #     #   run: nvidia-smi
+      #     halt-dispatch-input: ${{ inputs.halt-for-connection }
         
-  #     # - name: Configure XLA
-  #     #   working-directory: xla
-  #     #   run: ./configure.py --backend CUDA --nccl
-
-  #     # - name: Set TF_CPP_MAX_VLOG_LEVEL
-  #     #   working-directory: xla
-  #     #   run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV  # Use GITHUB_ENV to persist across steps
+      - name: Configure XLA
+        working-directory: xla
+        run: ./configure.py --backend CUDA --nccl
 
-  #     # - name: Check TF_CPP_MAX_VLOG_LEVEL
-  #     #   working-directory: xla
-  #     #   run: echo "$TF_CPP_MAX_VLOG_LEVEL"
+      - name: Set TF_CPP_MAX_VLOG_LEVEL
+        working-directory: xla
+        run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV  # Use GITHUB_ENV to persist across steps
 
-  #     # - name: Build hlo_runner_main
-  #     #   working-directory: xla
-  #     #   run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
+      - name: Build hlo_runner_main
+        working-directory: xla
+        run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
         
-  #     # - name: Wait For Connection
-  #     #   uses: google-ml-infra/actions/ci_connection@main
-  #     #   with:
-  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      # - name: Wait For Connection
+      #   uses: google-ml-infra/actions/ci_connection@main
+      #   with:
+      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
 
-  #     # - name: Create gpu_hlo_backend.hlo
-  #     #   working-directory: xla
-  #     #   run: |
-          # cat << EOF > gpu_hlo_backend.hlo
-          # HloModule module
-          # // CHECK: is_scheduled=true
+      - name: Create gpu_hlo_backend.hlo
+        working-directory: xla
+        run: |
+          cat << EOF > gpu_hlo_backend.hlo
+          HloModule module
+          // CHECK: is_scheduled=true
           
-          # ENTRY computation {
-          #     p = f32[5000,6000]{1,0} parameter(0)
-          #     e = f32[5000,6000]{1,0} sqrt(p)
-          #     c = f32[6000,5000] transpose(p), dimensions={1,0}
-          #     r = f32[300,20,5000] reshape(c)
-          #     ROOT out = (f32[5000,6000], f32[300,20,5000]) tuple(e,r)
-          # }
-          # EOF
+          ENTRY computation {
+              p = f32[5000,6000]{1,0} parameter(0)
+              e = f32[5000,6000]{1,0} sqrt(p)
+              c = f32[6000,5000] transpose(p), dimensions={1,0}
+              r = f32[300,20,5000] reshape(c)
+              ROOT out = (f32[5000,6000], f32[300,20,5000]) tuple(e,r)
+          }
+          EOF
           
-  #     # - name: Wait For Connection
-  #     #   uses: google-ml-infra/actions/ci_connection@main
-  #     #   with:
-  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      # - name: Wait For Connection
+      #   uses: google-ml-infra/actions/ci_connection@main
+      #   with:
+      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
           
-  #     # - name: Run an HLO file
-  #     #   working-directory: xla
-  #     #   run: |
-  #     #     ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning gpu_hlo_backend.hlo  &> results/gpu_hlo_backend.log
+      - name: Run an HLO file
+        working-directory: xla
+        run: |
+          ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --log_output=True --use_spmd_partitioning --xla_gpu_dump_xspace_to=resultts/xspace.pb gpu_hlo_backend.hlo  &> results/gpu_hlo_backend.log
     
-  #     # - name: Wait For Connection
-  #     #   uses: google-ml-infra/actions/ci_connection@main
-  #     #   with:
-  #     #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      - name: Wait For Connection
+        uses: google-ml-infra/actions/ci_connection@main
+        with:
+          halt-dispatch-input: ${{ inputs.halt-for-connection }}
    
   #     # - name: Download parse_xla_logs.py
   #     #   working-directory: xla
@@ -314,57 +100,9 @@ jobs:
   #     #   working-directory: xla
   #     #   run: python parse_xla_logs.py results/gpu_hlo_backend.log
 
-  #     - name: Upload Results
-  #       uses: actions/upload-artifact@v4
-  #       with:
-  #         name: gpu-xla-benchmarks
-  #         path: xla/results
-  # # # jax-build-and-test:
-  # # #   runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
-  # # #   container:
-  # # #     image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
-
-  # # #   env:
-  # # #     JAXCI_HERMETIC_PYTHON_VERSION: 3.11
-
-  # # #   steps:
-  # # #     - name: Checkout JAX Fork
-  # # #       uses: actions/checkout@v3
-  # # #       with:
-  # # #         repository: 'google-ml-infra/jax-fork'
-  # # #         path: jax-fork
-
-  # # #     - name: Install JAX Dependencies
-  # # #       working-directory: jax-fork
-  # # #       run: |
-  # # #         python -m pip install --upgrade pip
-  # # #         pip install pytest
-  # # #         pip install absl-py
-  # # #         pip install "jax[cuda12_pip]"  # Adjust CUDA version if needed
-  # # #         pip install google-benchmark
-  # # #     - name: Run JAX Multiprocess GPU Test
-  # # #       working-directory: jax-fork
-  # # #       continue-on-error: true
-  # # #       run: python -m pytest tests/multiprocess_gpu_test.py
-      
-
-  # #     # - name: Run HLO Module Benchmarks withg GPU in xla/tests/fuzz
-  # #     #   working-directory: xla
-  # #     #   continue-on-error: true
-  # #     #   run: |
-  # #     #     for file in xla/tests/fuzz/*.hlo; do
-  # #     #       filename=$(basename "$file")
-  # #     #       # Skip expected failed hlo files.
-  # #     #       if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
-  # #     #         echo "Skipping benchmark on $file"
-  # #     #         continue
-  # #     #       fi
-  # #     #       echo "Running benchmark on $file" &> results/"$filename".log
-  # #     #       ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning "$file" &> results/"$filename".log
-  # #     #     done
-
-  # #     # - name: Upload Results
-  # #     #   uses: actions/upload-artifact@v4
-  # #     #   with:
-  # #     #     name: gpu-xla-benchmarks
-  # #     #     path: xla/results
+      - name: Upload Results
+        uses: actions/upload-artifact@v4
+        with:
+          name: gpu-xla-benchmarks
+          path: xla/results
+