# CI - Nightly Run Benchmark on GKE #1
# NOTE: the file viewer flagged this file as containing hidden or bidirectional
# Unicode text that may be interpreted or compiled differently than it appears.
# To review, open the file in an editor that reveals hidden Unicode characters.
# Workflow identity and triggers for the nightly GKE benchmark run.
name: "CI - Nightly Run Benchmark on GKE"

on:
  # Scheduled trigger.
  schedule:
    - cron: "0 0 * * *"  # Daily at midnight UTC

  # push:
  #   branches:
  #     - main

  # Manual trigger; both inputs are optional and carry defaults.
  workflow_dispatch:
    inputs:
      input_dir:
        description: "Input directory for benchmark results"
        default: "/tmp/cicd/analysis"
        required: false
      output_dir:
        description: "Output directory name (S3 prefix and artifact name)"
        default: ""
        required: false
jobs:
  # Runs the repository-local docker-build-and-push action with the `nightly`
  # tag. Both benchmark jobs wait for this job through `needs`.
  build-image:
    name: Build Nightly Image
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v6.0.2

      - name: Build and push nightly image
        uses: ./.github/actions/docker-build-and-push
        with:
          tag: nightly
          image-name: llm-d-benchmark
          registry: ghcr.io/llm-d
          github-token: ${{ secrets.GHCR_TOKEN }}
          platform: linux/amd64

  # Benchmark using the `standalone` method: cleanup -> standup -> run ->
  # teardown, with diagnostic dumps when an earlier step has failed.
  benchmark-standalone:
    name: Benchmark - Standalone (GKE)
    runs-on: ubuntu-latest
    needs: build-image
    timeout-minutes: 240
    env:
      LLMDBENCH_CICD_NS: llmdbenchcicdsns-gke
      LLMDBENCH_CICD_R: llmdbenchcicdr-gke
      LLMDBENCH_CICD_TARGET: cicd/gke
      LLMDBENCH_CICD_METHOD: standalone
      LLMDBENCH_WORKSPACE: /tmp/llmdbenchcicds-gke
      GCP_PROJECT_ID: llm-d-scale
      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
      # Holds a region name; passed to gcloud through --location below.
      GKE_CLUSTER_ZONE: us-east5
      GATEWAY: gke-l7-regional-external-managed
      GATEWAY_TYPE: gke
      LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v6.0.2
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY }}

      - name: Set up gcloud CLI and kubectl
        uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
        with:
          project_id: ${{ env.GCP_PROJECT_ID }}
          install_components: 'kubectl,gke-gcloud-auth-plugin'

      - name: Get GKE credentials
        # --location accepts either a zone or a region; the configured value
        # (us-east5) is a region. Job-level env vars are read as shell
        # variables instead of being interpolated into the script text.
        run: |
          gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" --location "$GKE_CLUSTER_ZONE"

      - name: Install llmdbenchmark
        run: |
          ./install.sh -y 2>&1

      # Tears down anything left in the target namespace before standing up.
      - name: Cleanup target cloud
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      - name: Standup
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" standup -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      # Runs only when an earlier step in this job has failed.
      - name: Debug info (on failure)
        if: failure()
        env:
          # The script reads $NS, which the job-level env does not define;
          # map it to the benchmark namespace so kubectl targets it.
          NS: ${{ env.LLMDBENCH_CICD_NS }}
        run: |
          echo "=== PVC status ==="
          kubectl get pvc -n "$NS" -o wide || true
          echo ""
          echo "=== All pods ==="
          kubectl get pods -n "$NS" -o wide || true
          echo ""
          echo "=== Download job logs ==="
          kubectl logs job/download-model -n "$NS" --tail=50 || true
          echo ""
          echo "=== Download pod logs (previous) ==="
          for pod in $(kubectl get pods -n "$NS" -l job-name=download-model -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl logs -n "$NS" "$pod" --tail=50 2>/dev/null || true
            kubectl logs -n "$NS" "$pod" --previous --tail=50 2>/dev/null || true
          done
          echo ""
          echo "=== Disk usage on node ==="
          kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: allocatable ephemeral={.status.allocatable.ephemeral-storage}, capacity={.status.capacity.ephemeral-storage}{"\n"}{end}' || true
          echo ""
          echo "=== Failed pod descriptions ==="
          for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl describe -n "$NS" "$pod" 2>/dev/null | tail -20 || true
            echo "--- logs ---"
            kubectl logs -n "$NS" "$pod" --tail=30 --all-containers 2>/dev/null || true
          done
          echo ""
          echo "=== Events ==="
          kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -20 || true

      - name: Run
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" run -p "$LLMDBENCH_CICD_NS" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      # always(): clean up the cluster even when standup or run failed.
      - name: Teardown (standalone)
        if: always()
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      # NOTE(review): this dump runs after Teardown. If teardown removes the
      # namespace's workloads there may be little left to inspect; consider
      # moving it ahead of Teardown if it is meant to diagnose Run failures.
      - name: Debug info (on failure)
        if: failure()
        env:
          # The script reads $NS, which the job-level env does not define;
          # map it to the benchmark namespace so kubectl targets it.
          NS: ${{ env.LLMDBENCH_CICD_NS }}
        run: |
          echo "=== Pod status ==="
          kubectl get pods -n "$NS" -o wide || true
          echo ""
          echo "=== Pod descriptions (non-running) ==="
          for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl describe -n "$NS" "$pod" || true
          done
          echo ""
          echo "=== Pod logs (non-running) ==="
          for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl logs -n "$NS" "$pod" --tail=200 --all-containers || true
          done
          echo ""
          echo "=== Events ==="
          kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -30 || true

      - name: Install AWS CLI
        # --fail makes curl report an HTTP error itself; otherwise the error
        # page is saved as the zip and the silenced unzip fails without output.
        run: |
          curl --fail "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
          unzip awscliv2.zip >/dev/null 2>&1
          sudo ./aws/install || sudo ./aws/install --update
          aws --version

      # NOTE(review): the disabled steps below read INPUT_DIR / OUTPUT_DIR,
      # which no env block in this workflow sets; wire them up (for example
      # from the workflow_dispatch inputs) before re-enabling.
      # - name: Upload results to IBM COS
      #   env:
      #     AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      #     AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      #   run: |
      #     aws configure set default.s3.signature_version s3v4
      #     aws s3 cp "$INPUT_DIR" "s3://${{ secrets.COS_BUCKET_NAME }}/$OUTPUT_DIR/" \
      #       --recursive --endpoint-url ${{ secrets.COS_ENDPOINT_URL }} || true
      # - name: Archive benchmark results as GitHub artifact
      #   if: success() || failure()
      #   uses: actions/upload-artifact@v7.0.1
      #   with:
      #     name: ${{ env.OUTPUT_DIR }}
      #     path: ${{ env.INPUT_DIR }}
      #     retention-days: 14

  # Benchmark using the `modelservice` method; same step sequence as the
  # standalone job, with its own namespace and workspace.
  benchmark-modelservice:
    name: Benchmark - ModelService (GKE)
    runs-on: ubuntu-latest
    needs: build-image
    timeout-minutes: 240
    env:
      LLMDBENCH_CICD_NS: llmdbenchcicdmns-gke
      LLMDBENCH_CICD_R: llmdbenchcicdr-gke
      LLMDBENCH_CICD_TARGET: cicd/gke
      LLMDBENCH_CICD_METHOD: modelservice
      LLMDBENCH_WORKSPACE: /tmp/llmdbenchcicdm-gke
      GCP_PROJECT_ID: llm-d-scale
      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
      # Holds a region name; passed to gcloud through --location below.
      GKE_CLUSTER_ZONE: us-east5
      GATEWAY: gke-l7-regional-external-managed
      GATEWAY_TYPE: gke
      LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v6.0.2
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY }}

      - name: Set up gcloud CLI and kubectl
        uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
        with:
          project_id: ${{ env.GCP_PROJECT_ID }}
          install_components: 'kubectl,gke-gcloud-auth-plugin'

      - name: Get GKE credentials
        # --location accepts either a zone or a region; the configured value
        # (us-east5) is a region. Job-level env vars are read as shell
        # variables instead of being interpolated into the script text.
        run: |
          gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" --location "$GKE_CLUSTER_ZONE"

      - name: Install llmdbenchmark
        run: |
          ./install.sh -y 2>&1

      # Tears down anything left in the target namespace before standing up.
      - name: Cleanup target cloud
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      - name: Standup
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" standup -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      # Runs only when an earlier step in this job has failed.
      - name: Debug info (on failure)
        if: failure()
        env:
          # The script reads $NS, which the job-level env does not define;
          # map it to the benchmark namespace so kubectl targets it.
          NS: ${{ env.LLMDBENCH_CICD_NS }}
        run: |
          echo "=== PVC status ==="
          kubectl get pvc -n "$NS" -o wide || true
          echo ""
          echo "=== All pods ==="
          kubectl get pods -n "$NS" -o wide || true
          echo ""
          echo "=== Download job logs ==="
          kubectl logs job/download-model -n "$NS" --tail=50 || true
          echo ""
          echo "=== Download pod logs (previous) ==="
          for pod in $(kubectl get pods -n "$NS" -l job-name=download-model -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl logs -n "$NS" "$pod" --tail=50 2>/dev/null || true
            kubectl logs -n "$NS" "$pod" --previous --tail=50 2>/dev/null || true
          done
          echo ""
          echo "=== Disk usage on node ==="
          kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: allocatable ephemeral={.status.allocatable.ephemeral-storage}, capacity={.status.capacity.ephemeral-storage}{"\n"}{end}' || true
          echo ""
          echo "=== Failed pod descriptions ==="
          for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl describe -n "$NS" "$pod" 2>/dev/null | tail -20 || true
            echo "--- logs ---"
            kubectl logs -n "$NS" "$pod" --tail=30 --all-containers 2>/dev/null || true
          done
          echo ""
          echo "=== Events ==="
          kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -20 || true

      - name: Run
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" run -p "$LLMDBENCH_CICD_NS" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      # always(): clean up the cluster even when standup or run failed.
      - name: Teardown (modelservice)
        if: always()
        run: |
          llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
        shell: bash

      # NOTE(review): this dump runs after Teardown. If teardown removes the
      # namespace's workloads there may be little left to inspect; consider
      # moving it ahead of Teardown if it is meant to diagnose Run failures.
      - name: Debug info (on failure)
        if: failure()
        env:
          # The script reads $NS, which the job-level env does not define;
          # map it to the benchmark namespace so kubectl targets it.
          NS: ${{ env.LLMDBENCH_CICD_NS }}
        run: |
          echo "=== Pod status ==="
          kubectl get pods -n "$NS" -o wide || true
          echo ""
          echo "=== Pod descriptions (non-running) ==="
          for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl describe -n "$NS" "$pod" || true
          done
          echo ""
          echo "=== Pod logs (non-running) ==="
          for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
            echo "--- $pod ---"
            kubectl logs -n "$NS" "$pod" --tail=200 --all-containers || true
          done
          echo ""
          echo "=== Events ==="
          kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -30 || true

      - name: Install AWS CLI
        # --fail makes curl report an HTTP error itself; otherwise the error
        # page is saved as the zip and the silenced unzip fails without output.
        run: |
          curl --fail "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
          unzip awscliv2.zip >/dev/null 2>&1
          sudo ./aws/install || sudo ./aws/install --update
          aws --version

      # NOTE(review): the disabled steps below read INPUT_DIR / OUTPUT_DIR,
      # which no env block in this workflow sets; wire them up (for example
      # from the workflow_dispatch inputs) before re-enabling.
      # - name: Upload results to IBM COS
      #   env:
      #     AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      #     AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      #   run: |
      #     aws configure set default.s3.signature_version s3v4
      #     aws s3 cp "$INPUT_DIR" "s3://${{ secrets.COS_BUCKET_NAME }}/$OUTPUT_DIR/" \
      #       --recursive --endpoint-url ${{ secrets.COS_ENDPOINT_URL }} || true
      # - name: Archive benchmark results as GitHub artifact
      #   if: success() || failure()
      #   uses: actions/upload-artifact@v7.0.1
      #   with:
      #     name: ${{ env.OUTPUT_DIR }}
      #     path: ${{ env.INPUT_DIR }}
      #     retention-days: 14