# CI - Nightly Benchmark on GKE #114
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Nightly GKE benchmark workflow: display name and triggers.
name: CI - Nightly Benchmark on GKE

on:
  # Scheduled run: every day at 00:00 UTC.
  schedule:
    - cron: '0 0 * * *'

  # Manual run from the Actions tab; both inputs are optional.
  workflow_dispatch:
    inputs:
      # Directory uploaded as the results artifact at the end of the run.
      input_dir:
        description: 'Input directory for benchmark results'
        default: '/tmp/cicd/analysis'
        required: false
      # Name of the uploaded artifact. When left empty, the benchmark job
      # generates a timestamped "benchmark-results-..." name instead.
      output_dir:
        description: 'Output directory name'
        default: ''
        required: false

  # Push trigger intentionally disabled.
  # push:
  #   branches:
  #     - main
jobs:
  # Job 1: build the benchmark container image and push it under the
  # "nightly" tag, which the benchmark job below selects via
  # LLMDBENCH_IMAGE_TAG.
  build-image:
    name: Build Nightly Image
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      # Repository-local composite action; its behaviour is defined in
      # ./.github/actions/docker-build-and-push.
      - name: Build and push nightly image
        uses: ./.github/actions/docker-build-and-push
        with:
          tag: nightly
          image-name: llm-d-benchmark
          registry: ghcr.io/llm-d
          github-token: ${{ secrets.GHCR_TOKEN }}
          platform: linux/amd64

  # Job 2: stand up the standalone deployment on GKE, run the benchmark,
  # tear the deployment down, and archive the results.
  run-benchmark-gke:
    name: CI - Nightly Benchmark on GKE
    needs: build-image
    runs-on: [k8s-util]
    timeout-minutes: 240
    env:
      GCP_PROJECT_ID: llm-d-scale
      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
      # Holds a region name despite the variable name; see the
      # "Get GKE credentials" step.
      GKE_CLUSTER_ZONE: us-east5
      GATEWAY: gke-l7-regional-external-managed
      GATEWAY_TYPE: gke
      LLMDBENCH_IMAGE_TAG: nightly
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Install Python 3.11
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'

      - name: Display OS used
        run: |
          cat /etc/*os-*
        shell: bash

      # Prepend the active interpreter's lib directory to LD_LIBRARY_PATH
      # for all later steps.
      - name: Set LD_LIBRARY_PATH
        run: |
          python_lib_dir="$(python -c 'import sys; from pathlib import Path; print(Path(sys.executable).parent.parent / "lib")')"
          # Append the previous value only when it is non-empty: a trailing
          # ':' would add an empty entry, which the dynamic loader treats as
          # the current working directory.
          echo "LD_LIBRARY_PATH=${python_lib_dir}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV"
        shell: bash

      # Export INPUT_DIR (path archived at the end) and OUTPUT_DIR (artifact
      # name) for later steps. Both workflow_dispatch inputs are empty on
      # scheduled runs, so defaults are applied here.
      - name: Set input and output directory environment variables
        env:
          # Pass user-supplied inputs through the environment instead of
          # expanding them into the script text, so their content can never
          # be interpreted as shell code.
          REQUESTED_INPUT_DIR: ${{ github.event.inputs.input_dir }}
          REQUESTED_OUTPUT_DIR: ${{ github.event.inputs.output_dir }}
        run: |
          DEFAULT_INPUT_DIR=/tmp/cicd/analysis
          INPUT_DIR="${REQUESTED_INPUT_DIR:-$DEFAULT_INPUT_DIR}"
          echo "INPUT_DIR=$INPUT_DIR" >> "$GITHUB_ENV"

          if [ -z "$REQUESTED_OUTPUT_DIR" ]; then
            timestamp=$(date -u +%Y%m%dT%H%M%SZ)
            OUTPUT_DIR="benchmark-results-${timestamp}"
            echo "Using generated output dir: $OUTPUT_DIR"
          else
            OUTPUT_DIR="$REQUESTED_OUTPUT_DIR"
            echo "Using provided output dir: $OUTPUT_DIR"
          fi
          echo "OUTPUT_DIR=$OUTPUT_DIR" >> "$GITHUB_ENV"
        shell: bash

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY }}

      - name: Set up gcloud CLI and kubectl
        uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
        with:
          project_id: ${{ env.GCP_PROJECT_ID }}
          install_components: 'kubectl,gke-gcloud-auth-plugin'

      # GKE_CLUSTER_ZONE holds a region (us-east5), so use --location, which
      # accepts either a zone or a region. Job-level env values are read as
      # ordinary shell variables.
      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" --location "$GKE_CLUSTER_ZONE"
        shell: bash

      - name: Run install_deps.sh
        run: |
          sudo apt-get update
          ./setup/install_deps.sh -y
        shell: bash

      # LLMDBENCH_HF_TOKEN is set per step rather than at job level so the
      # secret is only exposed to the steps that list it.

      # Pre-run teardown, executed before standup.
      - name: Cleanup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: |
          ./setup/teardown.sh -c gke_H100_fb -t standalone -d
        shell: bash

      - name: Standup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: |
          ./setup/standup.sh -c gke_H100_fb -t standalone
        shell: bash

      - name: Run benchmark (standalone, inference-perf)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/run.sh -c gke_H100_fb -t standalone
        shell: bash

      # Post-run teardown.
      # NOTE(review): this step has no `if:` condition, so it is skipped when
      # an earlier step fails and the deployment stays up until the next
      # run's pre-run cleanup. That keeps pods available for the diagnostics
      # step below - confirm this is the intended trade-off.
      - name: Cleanup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/teardown.sh -c gke_H100_fb -t standalone -d
        shell: bash

      # Best-effort dump of cluster state when any earlier step failed.
      # Every kubectl call is guarded so this step itself never fails.
      - name: Collect failure diagnostics
        if: failure()
        env:
          NAMESPACE: llmdbenchcicd
        run: |
          echo "=== Pod status ==="
          kubectl get pods -n "$NAMESPACE" -o wide || true
          echo ""

          # Pods that are neither Running nor Succeeded. Queried once and
          # reused by both loops; '|| true' keeps errexit from aborting the
          # step if the query itself fails.
          failed_pods=$(kubectl get pods -n "$NAMESPACE" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null || true)

          echo "=== Describe failed pods ==="
          # Intentionally unquoted: one pod name per whitespace-separated word.
          for pod in $failed_pods; do
            echo "--- $pod ---"
            kubectl describe -n "$NAMESPACE" "$pod" || true
          done
          echo ""

          echo "=== Pod logs (crashed/errored) ==="
          for pod in $failed_pods; do
            echo "--- $pod logs ---"
            kubectl logs -n "$NAMESPACE" "$pod" --tail=200 --all-containers || true
            echo "--- $pod previous logs ---"
            kubectl logs -n "$NAMESPACE" "$pod" --previous --tail=200 --all-containers 2>/dev/null || true
          done
          echo ""

          echo "=== Recent events ==="
          kubectl get events -n "$NAMESPACE" --sort-by='.lastTimestamp' | tail -30 || true
        shell: bash

      # Upload whatever is in INPUT_DIR on success or failure (not when the
      # run was cancelled); the artifact is named after OUTPUT_DIR.
      - name: Archive benchmark results as GitHub artifact
        if: success() || failure()
        uses: actions/upload-artifact@v6
        with:
          name: ${{ env.OUTPUT_DIR }}
          path: ${{ env.INPUT_DIR }}
          retention-days: 14