Update benchmarks.yml

juliagmt-google · web-flow · commit f772be5feba5 · 2024-12-13T13:54:11.000-08:00
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -16,7 +16,7 @@ on:
         - 'no'
 
 jobs:
-  gpu-jax-build-and-test:
+  jax-build-and-test:
     runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
     container:
       image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
@@ -39,13 +39,12 @@ jobs:
           pip install absl-py
           pip install "jax[cuda12_pip]"  # Adjust CUDA version if needed
           pip install google-benchmark
-
       - name: Run JAX Multiprocess GPU Test
         working-directory: jax-fork
         continue-on-error: true
         run: python -m pytest tests/multiprocess_gpu_test.py
 
-  gpu-xla-build-and-test:
+  build-xla-gpu-and-test:
     runs-on: linux-x86-g2-48-l4-4gpu # Use a GPU-enabled runner
     container:
       image: "gcr.io/tensorflow-testing/nosla-cuda12.3-cudnn9.1-ubuntu20.04-manylinux2014-multipython:latest"
@@ -58,20 +57,24 @@ jobs:
           repository: openxla/xla  # Replace with your fork if needed
           path: xla
       
-      # - name: Build XLA with GPU support with RBE
-      #   working-directory: xla
-      #   continue-on-error: true
-      #   run: bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
+      - name: Build XLA with GPU support with RBE
+        working-directory: xla
+        continue-on-error: true
+        run: bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @tsl//tsl/...
      
-      # - name: Run XLA tests with GPU support with RBE
-      #   working-directory: xla
-      #   continue-on-error: true
-      #   run: bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
+      - name: Run XLA tests with GPU support with RBE
+        working-directory: xla
+        continue-on-error: true
+        run: bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd --config=warnings --config=rbe_linux_cuda_nvcc --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --@cuda_driver//:enable_forward_compatibility=true --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @tsl//tsl/...
 
-      # - name: Run Profile Analysis
-      #   working-directory: xla
-      #   continue-on-error: true
-      #   run: bazel analyze-profile profile.json.gz
+      - name: Run Profile Analysis
+        working-directory: xla
+        continue-on-error: true
+        run: bazel analyze-profile profile.json.gz
+        
+      - name: Create results directory
+        working-directory: xla
+        run: mkdir -p results  # -p: do not fail the job if the directory already exists
         
       - name: Get GPU spec
         working-directory: xla
@@ -90,16 +93,15 @@ jobs:
         working-directory: xla
         run: bazel build -c opt --dynamic_mode=off xla/tools:run_hlo_module --config=cuda
 
-      - name: Wait For Connection
-        uses: google-ml-infra/actions/ci_connection@main
-        with:
-          halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      # - name: Wait For Connection
+      #   uses: google-ml-infra/actions/ci_connection@main
+      #   with:
+      #     halt-dispatch-input: ${{ inputs.halt-for-connection }}
 
       - name: Run HLO Module Benchmarks withg GPU in xla/tests/fuzz
         working-directory: xla
         continue-on-error: true
         run: |
-          mkdir -p gpu_benchmark_logs/xla_tests_fuzz
           for file in xla/tests/fuzz/*.hlo; do
             filename=$(basename "$file")
             # Skip expected failed hlo files.
@@ -108,15 +110,20 @@ jobs:
               continue
             fi
             echo "Running benchmark on $file"
-            ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$file"  &> gpu_benchmark_logs/xla_tests_fuzz/"$filename".log
+            ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$file" &> results/"$filename".log  # basename only: "$file" contains xla/tests/fuzz/, which does not exist under results/
           done
-
+      - name: Check files
+        working-directory: xla
+        continue-on-error: true
+        run: |
+          ls results  # path is relative to working-directory (xla), where the benchmark logs are written
       - name: Run HLO Module Benchmarks withg GPU in xla/service/gpu
         working-directory: xla
         continue-on-error: true
         run: |
-          mkdir -p gpu_benchmark_logs/xla_service_gpu
-          find xla/service/gpu -name "*.hlo" -print0 | xargs -0 -I {} sh -c 'filename=$(basename "$1"); ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$1" &> gpu_benchmark_logs/xla_service_gpu/"$filename".log' sh {}
+          find xla/service/gpu -name "*.hlo" -print0 | while IFS= read -r -d $'\0' file; do
+            ./bazel-bin/xla/tools/run_hlo_module --input_format=hlo --platform=GPU "$file" &> results/"$(basename "$file")".log
+          done
       # - name: Build hlo_runner_main
       #   working-directory: xla
       #   run: bazel build -c opt --config=cuda --dynamic_mode=off //xla/tools/multihost_hlo_runner:hlo_runner_main
@@ -130,9 +137,8 @@ jobs:
       #       echo "Running benchmark on $file"
       #       bazel-bin/xla/tools/multihost_hlo_runner/hlo_runner_main "$file"
       #     done
-
       - name: Upload Results
         uses: actions/upload-artifact@v4
         with:
-          name: gpu-benchmark-logs
-          path: xla/gpu_benchmark_logs
+          name: gpu-xla-benchmarks
+          path: xla/results