# CI - Nightly Benchmark on GKE #114
#
# Workflow: CI - Nightly Benchmark on GKE

# Display name of the workflow.
name: CI - Nightly Benchmark on GKE

on:
  # Manual trigger. Both inputs are optional.
  workflow_dispatch:
    inputs:
      # Directory whose contents are uploaded as the results artifact.
      input_dir:
        description: 'Input directory for benchmark results'
        required: false
        default: '/tmp/cicd/analysis'
      # Artifact name; when left empty the benchmark job generates a
      # timestamped "benchmark-results-<UTC timestamp>" name instead.
      output_dir:
        description: 'Output directory name'
        required: false
        default: ''
  # push:
  #   branches:
  #     - main
  # Scheduled trigger: once a day at 00:00 (GitHub evaluates cron in UTC).
  schedule:
    - cron: '0 0 * * *'
jobs:
  # Runs the repository-local docker-build-and-push action to produce the
  # image tagged `nightly` that the benchmark job references through
  # LLMDBENCH_IMAGE_TAG.
  build-image:
    name: Build Nightly Image
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Build and push nightly image
        uses: ./.github/actions/docker-build-and-push
        with:
          tag: nightly
          image-name: llm-d-benchmark
          registry: ghcr.io/llm-d
          github-token: ${{ secrets.GHCR_TOKEN }}
          platform: linux/amd64

  # Stands up the standalone scenario on GKE, runs the benchmark, tears the
  # deployment down and uploads the results directory as an artifact.
  run-benchmark-gke:
    name: CI - Nightly Benchmark on GKE
    needs: build-image
    runs-on: [k8s-util]
    timeout-minutes: 240
    env:
      GCP_PROJECT_ID: llm-d-scale
      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
      GKE_CLUSTER_ZONE: us-east5
      GATEWAY: gke-l7-regional-external-managed
      GATEWAY_TYPE: gke
      LLMDBENCH_IMAGE_TAG: nightly
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Install Python 3.11
        uses: actions/setup-python@v6
        with:
          python-version: '3.11'

      - name: Display OS used
        run: |
          cat /etc/*os-*
        shell: bash

      # Prepend the lib directory that sits next to the active interpreter's
      # bin directory to LD_LIBRARY_PATH for all later steps.
      - name: Set LD_LIBRARY_PATH
        run: |
          python_lib_dir=$(python -c 'import sys; from pathlib import Path; print(Path(sys.executable).parent.parent / "lib")')
          # Append the previous value only when it is non-empty: a trailing ':'
          # would add an empty entry, which the dynamic loader treats as the
          # current working directory.
          echo "LD_LIBRARY_PATH=${python_lib_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> "$GITHUB_ENV"
        shell: bash

      # Resolve INPUT_DIR / OUTPUT_DIR for later steps. The dispatch inputs are
      # handed over through environment variables rather than being expanded
      # into the script text, so their content is never parsed as shell code.
      # On scheduled runs both inputs are empty and the defaults apply.
      - name: Set input and output directory environment variables
        env:
          REQUESTED_INPUT_DIR: ${{ github.event.inputs.input_dir }}
          REQUESTED_OUTPUT_DIR: ${{ github.event.inputs.output_dir }}
        run: |
          DEFAULT_INPUT_DIR=/tmp/cicd/analysis
          INPUT_DIR="${REQUESTED_INPUT_DIR:-$DEFAULT_INPUT_DIR}"
          echo "INPUT_DIR=$INPUT_DIR" >> "$GITHUB_ENV"
          if [ -z "$REQUESTED_OUTPUT_DIR" ]; then
            timestamp=$(date -u +%Y%m%dT%H%M%SZ)
            echo "OUTPUT_DIR=benchmark-results-${timestamp}" >> "$GITHUB_ENV"
            echo "Using generated output dir: benchmark-results-${timestamp}"
          else
            echo "OUTPUT_DIR=$REQUESTED_OUTPUT_DIR" >> "$GITHUB_ENV"
            echo "Using provided output dir: $REQUESTED_OUTPUT_DIR"
          fi
        shell: bash

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY }}

      - name: Set up gcloud CLI and kubectl
        uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
        with:
          project_id: ${{ env.GCP_PROJECT_ID }}
          install_components: 'kubectl,gke-gcloud-auth-plugin'

      # Cluster name and location come from the job-level env block above.
      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials "$GKE_CLUSTER_NAME" --zone "$GKE_CLUSTER_ZONE"
        shell: bash

      - name: Run install_deps.sh
        run: |
          sudo apt-get update
          ./setup/install_deps.sh -y
        shell: bash

      # Pre-run teardown, executed before standup.
      - name: Cleanup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: |
          ./setup/teardown.sh -c gke_H100_fb -t standalone -d
        shell: bash

      - name: Standup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: |
          ./setup/standup.sh -c gke_H100_fb -t standalone
        shell: bash

      - name: Run benchmark (standalone, inference-perf)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/run.sh -c gke_H100_fb -t standalone
        shell: bash

      # Post-run teardown.
      # NOTE(review): with the default step condition this is skipped when an
      # earlier step fails, so the deployment stays up until the next run's
      # pre-run teardown. Possibly intentional so the diagnostics step below
      # can still inspect the pods - confirm.
      - name: Cleanup target cloud (standalone)
        env:
          LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
        run: ./setup/teardown.sh -c gke_H100_fb -t standalone -d
        shell: bash

      # Best-effort dump of pod state, logs and events from the benchmark
      # namespace; every kubectl call is guarded so this step cannot fail.
      - name: Collect failure diagnostics
        if: failure()
        run: |
          NS=llmdbenchcicd
          echo "=== Pod status ==="
          kubectl get pods -n "$NS" -o wide || true
          # Pods that are neither Running nor Succeeded. Queried once and
          # reused by both loops; left empty when the query itself fails.
          failed_pods=$(kubectl get pods -n "$NS" --field-selector='status.phase!=Running,status.phase!=Succeeded' -o name 2>/dev/null || true)
          echo ""
          echo "=== Describe failed pods ==="
          for pod in $failed_pods; do
            echo "--- $pod ---"
            kubectl describe -n "$NS" "$pod" || true
          done
          echo ""
          echo "=== Pod logs (crashed/errored) ==="
          for pod in $failed_pods; do
            echo "--- $pod logs ---"
            kubectl logs -n "$NS" "$pod" --tail=200 --all-containers || true
            echo "--- $pod previous logs ---"
            kubectl logs -n "$NS" "$pod" --previous --tail=200 --all-containers 2>/dev/null || true
          done
          echo ""
          echo "=== Recent events ==="
          kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -30 || true
        shell: bash

      # Upload results after both successful and failed runs (not when the
      # run was cancelled).
      - name: Archive benchmark results as GitHub artifact
        if: success() || failure()
        uses: actions/upload-artifact@v6
        with:
          name: ${{ env.OUTPUT_DIR }}
          path: ${{ env.INPUT_DIR }}
          retention-days: 14