@@ -81,29 +81,126 @@ jobs:
       - name: Build hlo_runner_main
         working-directory: xla
         run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
+name: Benchmarks
 
-      # - name: Wait For Connection
-      #   uses: google-ml-infra/actions/ci_connection@main
-      #   with:
-      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+on:
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+    inputs:
+      halt-for-connection:
+        description: 'Should this workflow run wait for a remote connection?'
+        type: choice
+        required: true
+        default: 'no'
+        options:
+          - 'yes'
+          - 'no'
+
+jobs:
+  build-xla-gpu-and-test:
+    runs-on: linux-x86-g2-48-l4-4gpu
+    container:
+      image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
+      options: --gpus all --privileged
+
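+    # NOTE: '--gpus all' above assumes the runner host has the NVIDIA
+    # Container Toolkit set up; '--privileged' grants broad host access and
+    # is likely more than the benchmark steps strictly need.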
+    steps:
+      - name: Checkout XLA
+        uses: actions/checkout@v3
+        with:
+          repository: openxla/xla
+          path: xla
+
+      - name: Create results directory
+        working-directory: xla
+        run: mkdir -p results
 
-      - name: Run HLO Module Benchmarks withg GPU in xla/tests/fuzz
+      - name: Get GPU spec
         working-directory: xla
         continue-on-error: true
+        run: nvidia-smi
+
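+      # NOTE (assumption): configure.py generates XLA's local Bazel
+      # configuration; '--backend CUDA --nccl' selects the GPU backend with
+      # NCCL enabled for cross-GPU communication.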
+      - name: Configure XLA
+        working-directory: xla
+        run: ./configure.py --backend CUDA --nccl
+
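+      # TF_CPP_MAX_VLOG_LEVEL=1 enables VLOG(1) logging in the C++ code.
+      # Appending to $GITHUB_ENV is what carries the variable into later
+      # steps; a plain 'export' would only last for this step's shell.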
+      - name: Set TF_CPP_MAX_VLOG_LEVEL
+        working-directory: xla
+        run: echo "TF_CPP_MAX_VLOG_LEVEL=1" >> $GITHUB_ENV  # Use GITHUB_ENV to persist across steps
+
+      - name: Check TF_CPP_MAX_VLOG_LEVEL
+        working-directory: xla
+        run: echo "$TF_CPP_MAX_VLOG_LEVEL"
+
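+      # NOTE (assumption): '-c opt' builds with optimizations, which matters
+      # for benchmark numbers, and '--dynamic_mode=off' links the runner
+      # statically so it need not resolve shared libraries at run time.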
+      - name: Build hlo_runner_main
+        working-directory: xla
+        run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
+
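+      # The module below captures a transpose+convert pattern from
+      # b/284431534: an f32[1,4,32,192,384] input is transposed to
+      # [0,1,3,4,2] order and converted to s8 inside a single kInput fusion.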
+      - name: Create b284431534_transpose_convert_f32_s8.hlo
+        working-directory: xla
         run: |
-          for file in xla/tests/fuzz/*.hlo; do
-            filename=$(basename "$file")
-            # Skip expected failed hlo files.
-            if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
-              echo "Skipping benchmark on $file"
-              continue
-            fi
-            echo "Running benchmark on $file" &> results/"$filename".log
-            ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning "$file" &> results/"$filename".log
-          done
+          cat << EOF > b284431534_transpose_convert_f32_s8.hlo
+          HloModule test, entry_computation_layout={(f32[1,4,32,192,384]{4,3,2,1,0})->s8[1,4,192,384,32]{4,3,2,1,0}}
+
+          fusion {
+            param_0 = f32[1,4,32,192,384] parameter(0)
+            transpose = f32[1,4,192,384,32] transpose(param_0), dimensions={0,1,3,4,2}
+            ROOT convert = s8[1,4,192,384,32] convert(transpose)
+          }
+
+          ENTRY main {
+            param_0 = f32[1,4,32,192,384] parameter(0)
+            ROOT fusion = s8[1,4,192,384,32] fusion(param_0), kind=kInput, calls=fusion
+          }
+          EOF
+
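+      # NOTE: hlo_runner_main compiles and executes the module on the GPU
+      # backend; '--use_spmd_partitioning' turns on SPMD partitioning during
+      # compilation, and '&>' captures both stdout and stderr into the log.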
+      - name: Run specific HLO file
+        working-directory: xla
+        run: ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning b284431534_transpose_convert_f32_s8.hlo &> results/b284431534_transpose_convert_f32_s8.hlo.log
+
+      # - name: Run HLO Module Benchmarks with GPU in xla/tests/fuzz
+      #   working-directory: xla
+      #   continue-on-error: true
+      #   run: |
+      #     for file in xla/tests/fuzz/*.hlo; do
+      #       filename=$(basename "$file")
+      #       # Skip HLO files that are expected to fail.
+      #       if [[ "$filename" == "rand_000060.hlo" || "$filename" == "rand_000067.hlo" || "$filename" == "rand_000072.hlo" ]]; then
+      #         echo "Skipping benchmark on $file"
+      #         continue
+      #       fi
+      #       echo "Running benchmark on $file"
+      #       ./bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main --device_type=gpu --use_spmd_partitioning "$file" &> results/"$filename".log
+      #     done
 
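+      # Everything written under xla/results (the runner logs above) is
+      # published as a single build artifact.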
       - name: Upload Results
         uses: actions/upload-artifact@v4
         with:
           name: gpu-xla-benchmarks
           path: xla/results
+      # - name: Wait For Connection
+      #   uses: google-ml-infra/actions/ci_connection@main
+      #   with:
+      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
+