# CI - Nightly Run Benchmark on GKE (nightly run #1)
---
name: CI - Nightly Run Benchmark on GKE

on:
  # Manual trigger; inputs are consumed by the (currently commented-out)
  # results-upload steps at the end of each benchmark job.
  workflow_dispatch:
    inputs:
      input_dir:
        description: 'Input directory for benchmark results'
        required: false
        default: '/tmp/cicd/analysis'
      output_dir:
        description: 'Output directory name (S3 prefix and artifact name)'
        required: false
        default: ''
  # push:
  #   branches:
  #     - main
  schedule:
    - cron: '0 0 * * *'  # Daily at midnight UTC
jobs:
  # Builds the nightly container image that both benchmark jobs depend on.
  build-image:
    name: Build Nightly Image
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout code
        uses: actions/checkout@v6.0.2
      - name: Build and push nightly image
        uses: ./.github/actions/docker-build-and-push
        with:
          tag: nightly
          image-name: llm-d-benchmark
          registry: ghcr.io/llm-d
          github-token: ${{ secrets.GHCR_TOKEN }}
          platform: linux/amd64
benchmark-standalone:
name: Benchmark - Standalone (GKE)
runs-on: ubuntu-latest
needs: build-image
timeout-minutes: 240
env:
LLMDBENCH_CICD_NS: llmdbenchcicdsns-gke
LLMDBENCH_CICD_R: llmdbenchcicdr-gke
LLMDBENCH_CICD_TARGET: cicd/gke
LLMDBENCH_CICD_METHOD: standalone
LLMDBENCH_WORKSPACE: /tmp/llmdbenchcicds-gke
GCP_PROJECT_ID: llm-d-scale
GKE_CLUSTER_NAME: llm-d-e2e-us-east5
GKE_CLUSTER_ZONE: us-east5
GATEWAY: gke-l7-regional-external-managed
GATEWAY_TYPE: gke
LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
steps:
- name: Checkout Code
uses: actions/checkout@v6.0.2
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.12"
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
with:
credentials_json: ${{ secrets.GKE_SA_KEY }}
- name: Set up gcloud CLI and kubectl
uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
with:
project_id: ${{ env.GCP_PROJECT_ID }}
install_components: 'kubectl,gke-gcloud-auth-plugin'
- name: Get GKE credentials
run: |
gcloud container clusters get-credentials "${{ env.GKE_CLUSTER_NAME }}" --zone "${{ env.GKE_CLUSTER_ZONE }}"
- name: Install llmdbenchmark
run: |
./install.sh -y 2>&1
- name: Cleanup target cloud
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Standup
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" standup -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== PVC status ==="
kubectl get pvc -n "$NS" -o wide || true
echo ""
echo "=== All pods ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Download job logs ==="
kubectl logs job/download-model -n "$NS" --tail=50 || true
echo ""
echo "=== Download pod logs (previous) ==="
for pod in $(kubectl get pods -n "$NS" -l job-name=download-model -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=50 2>/dev/null || true
kubectl logs -n "$NS" "$pod" --previous --tail=50 2>/dev/null || true
done
echo ""
echo "=== Disk usage on node ==="
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: allocatable ephemeral={.status.allocatable.ephemeral-storage}, capacity={.status.capacity.ephemeral-storage}{"\n"}{end}' || true
echo ""
echo "=== Failed pod descriptions ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" 2>/dev/null | tail -20
echo "--- logs ---"
kubectl logs -n "$NS" "$pod" --tail=30 --all-containers 2>/dev/null || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -20 || true
- name: Run
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" run -p "$LLMDBENCH_CICD_NS" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Teardown (standalone)
if: always()
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== Pod status ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Pod descriptions (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" || true
done
echo ""
echo "=== Pod logs (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=200 --all-containers || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -30 || true
- name: Install AWS CLI
run: |
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip >/dev/null 2>&1
sudo ./aws/install || sudo ./aws/install --update
aws --version
# - name: Upload results to IBM COS
# env:
# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# run: |
# aws configure set default.s3.signature_version s3v4
# aws s3 cp "$INPUT_DIR" "s3://${{ secrets.COS_BUCKET_NAME }}/$OUTPUT_DIR/" \
# --recursive --endpoint-url ${{ secrets.COS_ENDPOINT_URL }} || true
# - name: Archive benchmark results as GitHub artifact
# if: success() || failure()
# uses: actions/upload-artifact@v7.0.1
# with:
# name: ${{ env.OUTPUT_DIR }}
# path: ${{ env.INPUT_DIR }}
# retention-days: 14
benchmark-modelservice:
name: Benchmark - ModelService (GKE)
runs-on: ubuntu-latest
needs: build-image
timeout-minutes: 240
env:
LLMDBENCH_CICD_NS: llmdbenchcicdmns-gke
LLMDBENCH_CICD_R: llmdbenchcicdr-gke
LLMDBENCH_CICD_TARGET: cicd/gke
LLMDBENCH_CICD_METHOD: modelservice
LLMDBENCH_WORKSPACE: /tmp/llmdbenchcicdm-gke
GCP_PROJECT_ID: llm-d-scale
GKE_CLUSTER_NAME: llm-d-e2e-us-east5
GKE_CLUSTER_ZONE: us-east5
GATEWAY: gke-l7-regional-external-managed
GATEWAY_TYPE: gke
LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
steps:
- name: Checkout Code
uses: actions/checkout@v6.0.2
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.12"
- name: Authenticate to Google Cloud
id: auth
uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093
with:
credentials_json: ${{ secrets.GKE_SA_KEY }}
- name: Set up gcloud CLI and kubectl
uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db
with:
project_id: ${{ env.GCP_PROJECT_ID }}
install_components: 'kubectl,gke-gcloud-auth-plugin'
- name: Get GKE credentials
run: |
gcloud container clusters get-credentials "${{ env.GKE_CLUSTER_NAME }}" --zone "${{ env.GKE_CLUSTER_ZONE }}"
- name: Install llmdbenchmark
run: |
./install.sh -y 2>&1
- name: Cleanup target cloud
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Standup
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" standup -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== PVC status ==="
kubectl get pvc -n "$NS" -o wide || true
echo ""
echo "=== All pods ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Download job logs ==="
kubectl logs job/download-model -n "$NS" --tail=50 || true
echo ""
echo "=== Download pod logs (previous) ==="
for pod in $(kubectl get pods -n "$NS" -l job-name=download-model -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=50 2>/dev/null || true
kubectl logs -n "$NS" "$pod" --previous --tail=50 2>/dev/null || true
done
echo ""
echo "=== Disk usage on node ==="
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}: allocatable ephemeral={.status.allocatable.ephemeral-storage}, capacity={.status.capacity.ephemeral-storage}{"\n"}{end}' || true
echo ""
echo "=== Failed pod descriptions ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" 2>/dev/null | tail -20
echo "--- logs ---"
kubectl logs -n "$NS" "$pod" --tail=30 --all-containers 2>/dev/null || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -20 || true
- name: Run
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" run -p "$LLMDBENCH_CICD_NS" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Teardown (standalone)
if: always()
run: |
llmdbenchmark --spec "$LLMDBENCH_CICD_TARGET" teardown -p "$LLMDBENCH_CICD_NS" -r "$LLMDBENCH_CICD_R" -t "$LLMDBENCH_CICD_METHOD"
shell: bash
- name: Debug info (on failure)
if: failure()
run: |
echo "=== Pod status ==="
kubectl get pods -n "$NS" -o wide || true
echo ""
echo "=== Pod descriptions (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl describe -n "$NS" "$pod" || true
done
echo ""
echo "=== Pod logs (non-running) ==="
for pod in $(kubectl get pods -n "$NS" --field-selector=status.phase!=Running,status.phase!=Succeeded -o name 2>/dev/null); do
echo "--- $pod ---"
kubectl logs -n "$NS" "$pod" --tail=200 --all-containers || true
done
echo ""
echo "=== Events ==="
kubectl get events -n "$NS" --sort-by='.lastTimestamp' | tail -30 || true
- name: Install AWS CLI
run: |
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip >/dev/null 2>&1
sudo ./aws/install || sudo ./aws/install --update
aws --version
# - name: Upload results to IBM COS
# env:
# AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
# AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
# run: |
# aws configure set default.s3.signature_version s3v4
# aws s3 cp "$INPUT_DIR" "s3://${{ secrets.COS_BUCKET_NAME }}/$OUTPUT_DIR/" \
# --recursive --endpoint-url ${{ secrets.COS_ENDPOINT_URL }} || true
# - name: Archive benchmark results as GitHub artifact
# if: success() || failure()
# uses: actions/upload-artifact@v7.0.1
# with:
# name: ${{ env.OUTPUT_DIR }}
# path: ${{ env.INPUT_DIR }}
# retention-days: 14