# CI - Nightly Run Benchmark on GKE (nightly run #1)
---
name: CI - Nightly Run Benchmark on GKE

on:
  # Manual trigger; inputs are consumed by the (currently commented-out)
  # results-upload steps at the end of each benchmark job.
  workflow_dispatch:
    inputs:
      input_dir:
        description: 'Input directory for benchmark results'
        required: false
        default: '/tmp/cicd/analysis'
      output_dir:
        description: 'Output directory name (S3 prefix and artifact name)'
        required: false
        default: ''
  # push:
  #   branches:
  #     - main
  schedule:
    - cron: '0 0 * * *'  # Daily at midnight UTC
jobs:
  # Builds the nightly container image that both benchmark jobs depend on.
  build-image:
    name: Build Nightly Image
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v6.0.2
      - name: Build and push nightly image
        uses: ./.github/actions/docker-build-and-push
        with:
          tag: nightly
          image-name: llm-d-benchmark
          registry: ghcr.io/llm-d
          github-token: ${{ secrets.GHCR_TOKEN }}
          platform: linux/amd64
benchmark-standalone:
name: Benchmark - Standalone (GKE)
runs-on: ubuntu-latest
needs: build-image
timeout-minutes: 240
env:
LLMDBENCH_CICD_NS: llmdbenchcicdsns-gke
LLMDBENCH_CICD_R: llmdbenchcicdr-gke
LLMDBENCH_CICD_TARGET: cicd/gke
LLMDBENCH_CICD_METHOD: standalone
LLMDBENCH_WORKSPACE: /tmp/llmdbenchcicds-gke
GCP_PROJECT_ID: llm-d-scale
GKE_CLUSTER_NAME: llm-d-e2e-us-east5
GKE_CLUSTER_ZONE: us-east5
GATEWAY: gke-l7-regional-external-managed
GATEWAY_TYPE: gke
LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
steps:
- name: Checkout Code
uses: actions/checkout@v6.0.2
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.12"
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
with:
credentials_json: ${{ secrets.GKE_SA_KEY }}
- name: Set up gcloud CLI and kubectl
uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
with:
project_id: ${{ env.GCP_PROJECT_ID }}
install_components: 'kubectl,gke-gcloud-auth-plugin'
- name: Get GKE credentials
run: |
gcloud container clusters get-credentials "${{ env.GKE_CLUSTER_NAME }}" --zone "${{ env.GKE_CLUSTER_ZONE }}"
- name: Install llmdbenchmark
run: |
./install.sh -y 2>&1
- name: Cleanup target cloud
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Standup
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" standup -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== PVC status ==="
kubectl get pvc -n "$NS" -o wide || true
echo ""
echo "=== All pods ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Download job logs ==="
kubectl logs job/download-model -n "$NS" --tail=50 || true
echo ""
echo "=== Download pod logs (previous) ==="
for pod in $(kubectl get pods -n "$NS" -l job-name=download-model -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=50 2>/dev/null || true
kubectl logs -n "$NS" "$pod" --previous --tail=50 2>/dev/null || true
done
echo ""
echo "=== Disk usage on node ==="
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: allocatable ephemeral={.status.allocatable.ephemeral-storage}, capacity={.status.capacity.ephemeral-storage}{"\n"}{end}' || true
echo ""
echo "=== Failed pod descriptions ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" 2>/dev/null | tail -20
echo "--- logs ---"
kubectl logs -n "$NS" "$pod" --tail=30 --all-containers 2>/dev/null || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -20 || true
- name: Run
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" run -p "$LLMDBENCH_CICD_NS" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Teardown (standalone)
if: always()
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== Pod status ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Pod descriptions (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" || true
done
echo ""
echo "=== Pod logs (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=200 --all-containers || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -30 || true
- name: Install AWS CLI
run: |
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip >/dev/null 2>&1
sudo ./aws/install || sudo ./aws/install --update
aws --version
# - name: Upload results to IBM COS
# env:
# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# run: |
# aws configure set default.s3.signature_version s3v4
# aws s3 cp "$INPUT_DIR" "s3://${{ secrets.COS_BUCKET_NAME }}/$OUTPUT_DIR/" \
# --recursive --endpoint-url ${{ secrets.COS_ENDPOINT_URL }} || true
# - name: Archive benchmark results as GitHub artifact
# if: success() || failure()
# uses: actions/upload-artifact@v7.0.1
# with:
# name: ${{ env.OUTPUT_DIR }}
# path: ${{ env.INPUT_DIR }}
# retention-days: 14
benchmark-modelservice:
name: Benchmark - ModelService (GKE)
runs-on: ubuntu-latest
needs: build-image
timeout-minutes: 240
env:
LLMDBENCH_CICD_NS: llmdbenchcicdmns-gke
LLMDBENCH_CICD_R: llmdbenchcicdr-gke
LLMDBENCH_CICD_TARGET: cicd/gke
LLMDBENCH_CICD_METHOD: modelservice
LLMDBENCH_WORKSPACE: /tmp/llmdbenchcicdm-gke
GCP_PROJECT_ID: llm-d-scale
GKE_CLUSTER_NAME: llm-d-e2e-us-east5
GKE_CLUSTER_ZONE: us-east5
GATEWAY: gke-l7-regional-external-managed
GATEWAY_TYPE: gke
LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
steps:
- name: Checkout Code
uses: actions/checkout@v6.0.2
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.12"
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
with:
credentials_json: ${{ secrets.GKE_SA_KEY }}
- name: Set up gcloud CLI and kubectl
uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
with:
project_id: ${{ env.GCP_PROJECT_ID }}
install_components: 'kubectl,gke-gcloud-auth-plugin'
- name: Get GKE credentials
run: |
gcloud container clusters get-credentials "${{ env.GKE_CLUSTER_NAME }}" --zone "${{ env.GKE_CLUSTER_ZONE }}"
- name: Install llmdbenchmark
run: |
./install.sh -y 2>&1
- name: Cleanup target cloud
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Standup
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" standup -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== PVC status ==="
kubectl get pvc -n "$NS" -o wide || true
echo ""
echo "=== All pods ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Download job logs ==="
kubectl logs job/download-model -n "$NS" --tail=50 || true
echo ""
echo "=== Download pod logs (previous) ==="
for pod in $(kubectl get pods -n "$NS" -l job-name=download-model -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=50 2>/dev/null || true
kubectl logs -n "$NS" "$pod" --previous --tail=50 2>/dev/null || true
done
echo ""
echo "=== Disk usage on node ==="
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: allocatable ephemeral={.status.allocatable.ephemeral-storage}, capacity={.status.capacity.ephemeral-storage}{"\n"}{end}' || true
echo ""
echo "=== Failed pod descriptions ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" 2>/dev/null | tail -20
echo "--- logs ---"
kubectl logs -n "$NS" "$pod" --tail=30 --all-containers 2>/dev/null || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -20 || true
- name: Run
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" run -p "$LLMDBENCH_CICD_NS" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Teardown (standalone)
if: always()
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== Pod status ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Pod descriptions (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" || true
done
echo ""
echo "=== Pod logs (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=200 --all-containers || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -30 || true
- name: Install AWS CLI
run: |
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip >/dev/null 2>&1
sudo ./aws/install || sudo ./aws/install --update
aws --version
# - name: Upload results to IBM COS
# env:
# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# run: |
# aws configure set default.s3.signature_version s3v4
# aws s3 cp "$INPUT_DIR" "s3://${{ secrets.COS_BUCKET_NAME }}/$OUTPUT_DIR/" \
# --recursive --endpoint-url ${{ secrets.COS_ENDPOINT_URL }} || true
# - name: Archive benchmark results as GitHub artifact
# if: success() || failure()
# uses: actions/upload-artifact@v7.0.1
# with:
# name: ${{ env.OUTPUT_DIR }}
# path: ${{ env.INPUT_DIR }}
# retention-days: 14