Skip to content

Benchmark: openshift | Qwen/Qwen3-0.6B | feat/benchmark-phase3-openshift #219

Benchmark: openshift | Qwen/Qwen3-0.6B | feat/benchmark-phase3-openshift

Benchmark: openshift | Qwen/Qwen3-0.6B | feat/benchmark-phase3-openshift #219

Workflow file for this run

name: CI - Benchmark

# Human-readable run title: manual dispatches show "platform | model | branch";
# comment-triggered runs show the PR number and the triggering command.
run-name: >-
  ${{ github.event_name == 'workflow_dispatch'
  && format('Benchmark: {0} | {1} | {2}',
  inputs.platform,
  inputs.model_id || 'unsloth/Meta-Llama-3.1-8B',
  github.ref_name)
  || format('Benchmark: PR #{0} | {1}',
  github.event.issue.number,
  github.event.comment.body) }}

concurrency:
  # Non-benchmark PR comments get an isolated group keyed by run id so they can
  # never cancel a real benchmark; actual benchmark runs are serialized per PR
  # (falling back to run id for dispatches without an associated issue).
  group: >-
    ${{
    github.event_name == 'issue_comment' &&
    !contains(github.event.comment.body, '/benchmark kind') &&
    !contains(github.event.comment.body, '/benchmark openshift')
    && format('benchmark-isolated-{0}', github.run_id)
    || format('benchmark-{0}',
    github.event.issue.number
    || github.run_id)
    }}
  cancel-in-progress: true

on:
  # PR comments ("/benchmark kind" or "/benchmark openshift") trigger runs.
  issue_comment:
    types: [created]
  # Manual trigger with explicit platform/model selection.
  workflow_dispatch:
    inputs:
      platform:
        description: 'Platform: kind or openshift'
        required: true
        default: 'kind'
        type: choice
        options:
          - kind
          - openshift
      model_id:
        description: 'Model to benchmark (HuggingFace ID)'
        required: false
        default: 'unsloth/Meta-Llama-3.1-8B'
        type: string
jobs:
  gate:
    # Authorizes and routes benchmark requests. Emits a run/skip decision, the
    # target platform, and PR metadata (number / head SHA / head repo) that the
    # downstream jobs consume for checkout and result reporting.
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: write
    outputs:
      run_benchmark: ${{ steps.check.outputs.run_benchmark }}
      platform: ${{ steps.check.outputs.platform }}
      pr_number: ${{ steps.check.outputs.pr_number }}
      pr_head_sha: ${{ steps.check.outputs.pr_head_sha }}
      pr_head_repo: ${{ steps.check.outputs.pr_head_repo }}
    steps:
      - name: Check if benchmark requested
        id: check
        uses: actions/github-script@v7
        with:
          script: |
            // True when `username` has write, maintain or admin permission on
            // this repo; any API failure is treated as "no access".
            async function hasWriteAccess(username) {
              try {
                const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  username: username
                });
                const privilegedRoles = ['admin', 'maintain', 'write'];
                return privilegedRoles.includes(permission.permission);
              } catch (e) {
                console.log(`Could not get permissions for ${username}: ${e.message}`);
                return false;
              }
            }
            if (context.eventName !== 'issue_comment' && context.eventName !== 'workflow_dispatch') {
              core.setOutput('run_benchmark', 'false');
              return;
            }
            if (context.eventName === 'workflow_dispatch') {
              // Manual dispatch: no permission check needed (dispatching
              // already requires repo access). Platform comes from the input.
              const platform = context.payload.inputs.platform;
              console.log(`Manual benchmark dispatch for ${platform}`);
              core.setOutput('run_benchmark', 'true');
              core.setOutput('platform', platform);
              // Try to find a PR for the current branch so we can post results
              const branch = context.ref.replace('refs/heads/', '');
              const { data: prs } = await github.rest.pulls.list({
                owner: context.repo.owner,
                repo: context.repo.repo,
                head: `${context.repo.owner}:${branch}`,
                state: 'open',
              });
              if (prs.length > 0) {
                core.setOutput('pr_number', prs[0].number.toString());
                core.setOutput('pr_head_sha', prs[0].head.sha);
                console.log(`Found open PR #${prs[0].number} for branch ${branch}`);
              } else {
                console.log(`No open PR found for branch ${branch}, skipping PR outputs`);
              }
              return;
            }
            // issue_comment path: only exact benchmark commands, on a PR, from
            // a user with write access, are honoured.
            const comment = context.payload.comment.body.trim();
            const issue = context.payload.issue;
            if (!issue.pull_request) {
              console.log('Comment is not on a PR, skipping');
              core.setOutput('run_benchmark', 'false');
              return;
            }
            const validCommands = ['/benchmark kind', '/benchmark openshift'];
            if (!validCommands.includes(comment)) {
              console.log(`Comment "${comment}" is not a valid benchmark command, skipping`);
              core.setOutput('run_benchmark', 'false');
              return;
            }
            const commenter = context.payload.comment.user.login;
            const hasAccess = await hasWriteAccess(commenter);
            if (!hasAccess) {
              console.log(`User ${commenter} does not have write access, ignoring ${comment}`);
              core.setOutput('run_benchmark', 'false');
              return;
            }
            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: issue.number
            });
            // pr.head.repo can be null (deleted fork); fall back to base repo.
            const baseRepo = `${context.repo.owner}/${context.repo.repo}`;
            const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo;
            // Fix: log the actual command — this previously hard-coded
            // "/benchmark kind" even for openshift requests.
            console.log(`${comment} approved by ${commenter} for PR #${issue.number}`);
            console.log(`PR head SHA: ${pr.head.sha}`);
            // Acknowledge the triggering comment with a rocket reaction.
            await github.rest.reactions.createForIssueComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: context.payload.comment.id,
              content: 'rocket'
            });
            const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
            const platform = comment.includes('openshift') ? 'OpenShift' : 'Kind';
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: issue.number,
              body: `🚀 **Benchmark (${platform})** triggered by \`${comment}\`\n\n[View the benchmark workflow run](${runUrl})`
            });
            core.setOutput('run_benchmark', 'true');
            core.setOutput('platform', platform.toLowerCase());
            core.setOutput('pr_number', issue.number.toString());
            core.setOutput('pr_head_sha', pr.head.sha);
            core.setOutput('pr_head_repo', headRepo);
build-image:
needs: gate
if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift')
runs-on: ubuntu-latest
outputs:
image_tag: ${{ steps.build.outputs.image_tag }}
steps:
- name: Checkout source
uses: actions/checkout@v4
with:
ref: ${{ needs.gate.outputs.pr_head_sha }}
- name: Log in to GHCR
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ secrets.CR_USER }}
password: ${{ secrets.CR_TOKEN }}
- name: Build and push image
id: build
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
GIT_REF: ${{ needs.gate.outputs.pr_head_sha }}
run: |
IMAGE_TAG="bench-$(printf '%s' "$GIT_REF" | cut -c1-8)"
FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}"
echo "Building image: $FULL_IMAGE"
make docker-build IMG="$FULL_IMAGE"
make docker-push IMG="$FULL_IMAGE"
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
benchmark-kind:
runs-on: ubuntu-latest
needs: [gate]
if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'kind' || github.event.inputs.platform == 'kind')
timeout-minutes: 45
permissions:
contents: write
statuses: write
pull-requests: write
actions: read
steps:
- name: Set pending status on PR head
if: github.event_name == 'issue_comment'
uses: actions/github-script@v7
with:
script: |
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: '${{ needs.gate.outputs.pr_head_sha }}',
state: 'pending',
target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
description: 'Benchmark running...',
context: '${{ github.workflow }} / benchmark-kind'
});
- name: Validate PR head SHA
if: github.event_name == 'issue_comment'
run: |
if [ -z "${{ needs.gate.outputs.pr_head_sha }}" ]; then
echo "::error::pr_head_sha is empty — refusing to fall back to main"
exit 1
fi
echo "Checkout will use PR head SHA: ${{ needs.gate.outputs.pr_head_sha }}"
- name: Checkout source
uses: actions/checkout@v4
with:
repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }}
ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
token: ${{ secrets.GITHUB_TOKEN }}
- name: Extract Go version from go.mod
run: sed -En 's/^go (.*)$/GO_VERSION=\1/p' go.mod >> $GITHUB_ENV
- name: Set up Go with cache
uses: actions/setup-go@v6
with:
go-version: "${{ env.GO_VERSION }}"
cache-dependency-path: ./go.sum
- name: Install dependencies
run: go mod download
- name: Install Kind
run: |
ARCH=$(uname -m)
case "$ARCH" in
x86_64) KIND_ARCH="amd64" ;;
aarch64) KIND_ARCH="arm64" ;;
*) echo "Unsupported architecture: $ARCH"; exit 1 ;;
esac
curl -Lo ./kind "https://kind.sigs.k8s.io/dl/v0.25.0/kind-linux-${KIND_ARCH}"
chmod +x ./kind
sudo mv ./kind /usr/local/bin/kind
kind version
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build WVA image locally
id: build-image
env:
CHECKOUT_SHA: ${{ needs.gate.outputs.pr_head_sha }}
run: |
IMAGE_NAME="llm-d-workload-variant-autoscaler"
IMAGE_TAG="bench-${CHECKOUT_SHA:0:7}"
FULL_IMAGE="localhost/${IMAGE_NAME}:${IMAGE_TAG}"
echo "Building local image: $FULL_IMAGE"
make docker-build IMG="$FULL_IMAGE"
echo "image=$FULL_IMAGE" >> $GITHUB_OUTPUT
- name: Deploy e2e infrastructure
env:
ENVIRONMENT: kind-emulator
USE_SIMULATOR: "true"
CREATE_CLUSTER: "true"
INSTALL_GATEWAY_CTRLPLANE: "true"
E2E_TESTS_ENABLED: "true"
IMG: ${{ steps.build-image.outputs.image }}
SKIP_BUILD: "true"
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
INSTALL_GRAFANA: "true"
run: make deploy-e2e-infra
- name: Run benchmark
env:
ENVIRONMENT: kind-emulator
USE_SIMULATOR: "true"
SCALER_BACKEND: prometheus-adapter
BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json
BENCHMARK_GRAFANA_ENABLED: "true"
BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt
BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json
BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
run: make test-benchmark
- name: Upload benchmark results
if: always()
uses: actions/upload-artifact@v4
with:
name: benchmark-results
path: |
/tmp/benchmark-results.json
/tmp/prefill-benchmark-results.json
/tmp/benchmark-grafana-snapshot.txt
/tmp/benchmark-grafana-snapshot.json
/tmp/benchmark-panels/
if-no-files-found: warn
- name: Post benchmark results as PR comment
if: always() && github.event_name == 'issue_comment'
uses: actions/github-script@v7
with:
script: |
const fs = require('fs');
const path = require('path');
const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}');
const sha = '${{ needs.gate.outputs.pr_head_sha }}';
const runId = context.runId;
const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;
// Look up the uploaded artifact to get a direct download link
let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
try {
const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
owner: context.repo.owner,
repo: context.repo.repo,
run_id: runId
});
const benchArtifact = artifacts.find(a => a.name === 'benchmark-results');
if (benchArtifact) {
artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
}
} catch (e) {
console.log(`Could not look up artifact: ${e.message}`);
}
let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.';
try {
const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8'));
const fmtTime = (v) => v < 0 ? 'N/A' : `${v.toFixed(1)}s`;
resultsTable = `| Metric | Value |
|--------|-------|
| Scale-up time | ${fmtTime(data.scaleUpTimeSec)} |
| Scale-down time | ${fmtTime(data.scaleDownTimeSec)} |
| Max replicas | ${data.maxReplicas} |
| Avg KV cache usage | ${data.avgKVCacheUsage.toFixed(3)} |
| Avg queue depth | ${data.avgQueueDepth.toFixed(1)} |
| Replica oscillation (σ) | ${data.replicaOscillation.toFixed(2)} |
| Total duration | ${data.totalDurationSec.toFixed(0)}s |`;
} catch (e) {
console.log(`Could not read results: ${e.message}`);
}
// Upload panel PNGs as release assets and collect URLs for embedding
let panelImages = '';
const panelDir = '/tmp/benchmark-panels';
const hasPanels = fs.existsSync(panelDir) &&
fs.readdirSync(panelDir).some(f => f.endsWith('.png'));
if (hasPanels) {
const pngs = fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort();
const tag = `benchmark-run-${runId}`;
try {
// Create a lightweight release to host panel images
const release = await github.rest.repos.createRelease({
owner: context.repo.owner,
repo: context.repo.repo,
tag_name: tag,
name: `Benchmark panels (PR #${prNumber}, ${sha.substring(0, 7)})`,
body: `Auto-generated by benchmark CI run #${runId}`,
draft: false,
prerelease: true
});
const imageUrls = [];
for (const png of pngs) {
const filePath = path.join(panelDir, png);
const fileData = fs.readFileSync(filePath);
const asset = await github.rest.repos.uploadReleaseAsset({
owner: context.repo.owner,
repo: context.repo.repo,
release_id: release.data.id,
name: png,
data: fileData,
headers: { 'content-type': 'image/png' }
});
const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' ');
imageUrls.push(`#### ${title}\n![${title}](${asset.data.browser_download_url})`);
console.log(`Uploaded ${png}: ${asset.data.browser_download_url}`);
}
if (imageUrls.length > 0) {
panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`;
}
} catch (e) {
console.log(`Could not upload panel images: ${e.message}`);
}
}
// Check for Grafana snapshot
const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
let artifactsSection = '';
if (hasSnapshotJson || hasPanels) {
const items = [];
if (hasSnapshotJson) {
items.push('Grafana snapshot JSON');
}
artifactsSection = `\n\n📎 **[Download artifacts](${artifactUrl})**${items.length ? ' — ' + items.join(', ') : ''}`;
}
const body = `## Benchmark: scale-up-latency (Kind)
${resultsTable}${panelImages}${artifactsSection}
<details>
<summary>Environment</summary>
- Cluster: Kind (emulated GPUs)
- Model: unsloth/Meta-Llama-3.1-8B (simulator)
- Commit: ${sha.substring(0, 7)}
- Scaler: prometheus-adapter
- [Workflow run](${repoUrl}/actions/runs/${runId})
</details>`;
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: body
});
- name: Cleanup Kind cluster
if: always()
run: kind delete cluster --name kind-wva-gpu-cluster || true
benchmark-openshift:
runs-on: [self-hosted, openshift, vllm-d]
needs: [gate, build-image]
if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift')
timeout-minutes: 60
permissions:
contents: write
statuses: write
pull-requests: write
actions: read
env:
MODEL_ID: ${{ inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }}
ACCELERATOR_TYPE: 'H100'
GOTOOLCHAIN: auto
LLMD_NAMESPACE: llm-d-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
WVA_NAMESPACE: wva-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }}
WVA_RELEASE_NAME: wva-bench-${{ github.run_id }}
WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }}
steps:
- name: Set pending status on PR head
if: github.event_name == 'issue_comment'
uses: actions/github-script@v7
with:
script: |
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: '${{ needs.gate.outputs.pr_head_sha }}',
state: 'pending',
target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
description: 'Benchmark running on OpenShift...',
context: '${{ github.workflow }} / benchmark-openshift'
});
- name: Checkout source
uses: actions/checkout@v4
with:
repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }}
ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }}
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Go
uses: actions/setup-go@v6
with:
go-version: "1.25.x"
cache-dependency-path: ./go.sum
- name: Install tools (kubectl, oc, helm, make)
run: |
sudo apt-get update && sudo apt-get install -y make
KUBECTL_VERSION="v1.31.0"
curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
chmod +x kubectl
sudo mv kubectl /usr/local/bin/
curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz"
tar -xzf openshift-client-linux.tar.gz
sudo mv oc /usr/local/bin/
rm -f openshift-client-linux.tar.gz kubectl README.md
curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
- name: Verify cluster access
run: |
kubectl cluster-info
kubectl get nodes
- name: Get HF token from cluster secret
id: hf-token
run: |
HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d)
echo "::add-mask::$HF_TOKEN"
echo "HF_TOKEN=$HF_TOKEN" >> $GITHUB_ENV
- name: Clean up resources for this PR
run: |
for ns in "$LLMD_NAMESPACE" "$WVA_NAMESPACE"; do
if kubectl get namespace "$ns" &>/dev/null; then
kubectl delete hpa -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
kubectl delete variantautoscaling -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true
for release in $(helm list -n "$ns" -q 2>/dev/null); do
helm uninstall "$release" -n "$ns" --ignore-not-found --wait --timeout 60s || true
done
kubectl delete namespace "$ns" --ignore-not-found --timeout=60s || true
fi
done
- name: Apply latest CRDs
run: kubectl apply -f charts/workload-variant-autoscaler/crds/
- name: Deploy WVA and llm-d infrastructure
env:
ENVIRONMENT: openshift
INSTALL_GATEWAY_CTRLPLANE: "false"
E2E_TESTS_ENABLED: "true"
NAMESPACE_SCOPED: "false"
LLMD_NS: ${{ env.LLMD_NAMESPACE }}
WVA_NS: ${{ env.WVA_NAMESPACE }}
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
DEPLOY_VA: "false"
DEPLOY_HPA: "false"
DECODE_REPLICAS: "1"
MONITORING_NAMESPACE: openshift-user-workload-monitoring
WVA_METRICS_SECURE: "false"
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
VLLM_SVC_PORT: "8000"
INSTALL_GRAFANA: "true"
run: |
./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift
- name: Label namespaces for OpenShift monitoring
run: |
kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite
kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite
- name: Wait for infrastructure to be ready
run: |
kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true
kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || true
echo "--- Services in openshift-user-workload-monitoring ---"
kubectl get svc -n openshift-user-workload-monitoring
echo "--- Services in openshift-monitoring ---"
kubectl get svc -n openshift-monitoring
- name: Run benchmark
env:
ENVIRONMENT: openshift
USE_SIMULATOR: "false"
SCALER_BACKEND: prometheus-adapter
CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }}
E2E_MONITORING_NAMESPACE: openshift-user-workload-monitoring
E2E_EMULATED_LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }}
CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }}
BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json
BENCHMARK_GRAFANA_ENABLED: "true"
BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt
BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json
BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels
KV_SPARE_TRIGGER: "0.1"
QUEUE_SPARE_TRIGGER: "3"
run: |
# Get token for Thanos querier
export PROMETHEUS_TOKEN=$(kubectl create token prometheus-k8s -n openshift-monitoring --duration=24h 2>/dev/null || echo "")
# Start APIService guard: KEDA on this cluster continuously reclaims the
# external.metrics.k8s.io APIService. This background loop re-patches it
# every 8 seconds so the HPA can read wva_desired_replicas during the benchmark.
# Key fix: caBundle must be set to null because KEDA sets it, and Kubernetes
# rejects insecureSkipTLSVerify=true when caBundle is present.
MONITORING_NS="openshift-user-workload-monitoring"
(
while true; do
sleep 8
current_svc=$(kubectl get apiservice v1beta1.external.metrics.k8s.io -o jsonpath='{.spec.service.name}' 2>/dev/null)
current_ns=$(kubectl get apiservice v1beta1.external.metrics.k8s.io -o jsonpath='{.spec.service.namespace}' 2>/dev/null)
if [ "$current_svc" != "prometheus-adapter" ] || [ "$current_ns" != "$MONITORING_NS" ]; then
echo "[apiservice-guard] KEDA reclaimed (now: $current_svc/$current_ns), re-patching..."
kubectl patch apiservice v1beta1.external.metrics.k8s.io --type=merge -p "{
\"spec\": {
\"caBundle\": null,
\"insecureSkipTLSVerify\": true,
\"service\": {
\"name\": \"prometheus-adapter\",
\"namespace\": \"$MONITORING_NS\"
}
}
}" 2>&1 || true
fi
done
) &
GUARD_PID=$!
echo "APIService guard started (PID=$GUARD_PID)"
# Give guard time to do initial patch if needed
sleep 12
echo "Checking external metrics API..."
kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1" | head -1 && echo "External metrics API: OK" || echo "WARNING: External metrics API not available"
TEST_EXIT=0
make test-benchmark || TEST_EXIT=$?
kill $GUARD_PID 2>/dev/null || true
exit $TEST_EXIT
- name: Generate benchmark plots
if: always()
run: |
echo "Installing matplotlib and numpy..."
if python3 -m venv /tmp/plot-venv 2>&1; then
/tmp/plot-venv/bin/pip install --quiet matplotlib numpy 2>&1
PYTHON=/tmp/plot-venv/bin/python3
else
echo "venv failed, using PIP_BREAK_SYSTEM_PACKAGES fallback..."
curl -sSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py
PIP_BREAK_SYSTEM_PACKAGES=1 python3 /tmp/get-pip.py --user 2>&1
PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip install --user matplotlib numpy 2>&1
PYTHON=python3
fi
$PYTHON - <<'PLOTEOF'
import json, os, sys
try:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
except ImportError:
print("matplotlib not available, skipping plot generation")
sys.exit(0)
PANEL_DIR = '/tmp/benchmark-panels'
PREFILL_FILE = '/tmp/prefill-benchmark-results.json'
os.makedirs(PANEL_DIR, exist_ok=True)
if not os.path.exists(PREFILL_FILE):
print("No prefill results found, skipping plots")
sys.exit(0)
with open(PREFILL_FILE) as f:
results = json.load(f)
if not isinstance(results, list) or len(results) < 2:
print("Need at least 2 results (HPA + WVA) for comparison plots")
sys.exit(0)
hpa = next((r for r in results if r['autoscaler_type'] == 'HPA'), None)
wva = next((r for r in results if r['autoscaler_type'] == 'WVA'), None)
if not hpa or not wva:
print("Missing HPA or WVA results")
sys.exit(0)
plt.rcParams.update({
'figure.facecolor': 'white', 'axes.facecolor': '#f8f9fa',
'axes.grid': True, 'grid.alpha': 0.3, 'font.size': 12,
'axes.titlesize': 14, 'axes.labelsize': 12, 'figure.dpi': 150,
})
HPA_C, WVA_C = '#e74c3c', '#2ecc71'
EMPTY_METRIC = {'mean': 0, 'count': 0, 'percentiles': {k: 0 for k in ['p05','p10','p25','p50','p75','p90','p95','p99']}}
def m(data, key):
"""Safely get a metric dict, returning EMPTY_METRIC if absent."""
v = data.get(key)
if isinstance(v, dict):
if 'percentiles' not in v:
v['percentiles'] = EMPTY_METRIC['percentiles']
return v
return EMPTY_METRIC
def bar_pair(ax, hv, wv, title, ylabel, fmt='.1f'):
bars = ax.bar(['HPA', 'WVA'], [hv, wv], color=[HPA_C, WVA_C], width=0.5, edgecolor='white', linewidth=1.5)
ax.set_title(title, fontweight='bold')
ax.set_ylabel(ylabel)
for bar, val in zip(bars, [hv, wv]):
ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + ax.get_ylim()[1]*0.02,
f'{val:{fmt}}', ha='center', va='bottom', fontweight='bold', fontsize=11)
# 1. Comparison bars (3x3 grid)
fig, axes = plt.subplots(3, 3, figsize=(20, 13))
fig.suptitle('HPA vs WVA — Prefill-Heavy Workload (OpenShift)', fontsize=16, fontweight='bold', y=1.02)
bar_pair(axes[0,0], m(hpa,'throughput')['mean'], m(wva,'throughput')['mean'], 'Mean Throughput', 'tokens/sec', '.0f')
bar_pair(axes[0,1], m(hpa,'ttft')['count'], m(wva,'ttft')['count'], 'Completed Requests', 'count', 'd')
bar_pair(axes[0,2], hpa['max_replicas'], wva['max_replicas'], 'Max Replicas', 'replicas', 'd')
bar_pair(axes[1,0], hpa['avg_kv_cache'], wva['avg_kv_cache'], 'Avg KV Cache', 'utilization', '.3f')
bar_pair(axes[1,1], hpa['avg_queue_depth'], wva['avg_queue_depth'], 'Avg vLLM Queue', 'requests', '.0f')
bar_pair(axes[1,2], hpa.get('avg_epp_queue_depth',0), wva.get('avg_epp_queue_depth',0), 'Avg EPP Queue', 'requests', '.0f')
bar_pair(axes[2,0], m(hpa,'itl')['mean'], m(wva,'itl')['mean'], 'Mean ITL', 'ms', '.2f')
bar_pair(axes[2,1], m(hpa,'ttft')['mean']/1000, m(wva,'ttft')['mean']/1000, 'Mean TTFT', 'seconds', '.1f')
bar_pair(axes[2,2], hpa['avg_replicas'], wva['avg_replicas'], 'Avg Replicas', 'replicas', '.1f')
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-comparison.png'), bbox_inches='tight', dpi=150)
plt.close()
# 2. Replica timeline overlay
fig, ax = plt.subplots(figsize=(14, 5))
for data, label, color in [(hpa, 'HPA', HPA_C), (wva, 'WVA', WVA_C)]:
tl = data['replica_timeline']
times = [s['elapsed_sec'] for s in tl]
ready = [s['ready_replicas'] for s in tl]
ax.step(times, ready, where='post', label=f'{label} (ready)', color=color, linewidth=2.5)
ax.fill_between(times, ready, step='post', alpha=0.1, color=color)
ax.set_title('Ready Replicas Over Time — HPA vs WVA', fontsize=14, fontweight='bold')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Ready Replicas')
ax.legend(fontsize=12)
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
ax.set_ylim(0, max(wva['max_replicas'], hpa['max_replicas']) + 1)
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-replica-timeline.png'), bbox_inches='tight', dpi=150)
plt.close()
# 3. Queue depth (vLLM + EPP) + KV cache over time
fig, (ax_qd, ax_epp, ax_kv) = plt.subplots(3, 1, figsize=(14, 11), sharex=True)
fig.suptitle('Queue Depth & KV Cache Over Time — HPA vs WVA', fontsize=15, fontweight='bold')
for data, label, color in [(hpa, 'HPA', HPA_C), (wva, 'WVA', WVA_C)]:
mt = data.get('metrics_timeline', [])
if mt:
times = [s['elapsed_sec'] for s in mt]
qd = [s['queue_depth'] for s in mt]
epp_qd = [s.get('epp_queue_depth', 0) for s in mt]
kv = [s['kv_cache'] for s in mt]
ax_qd.plot(times, qd, label=label, color=color, linewidth=2, alpha=0.85)
ax_qd.fill_between(times, qd, alpha=0.1, color=color)
ax_epp.plot(times, epp_qd, label=label, color=color, linewidth=2, alpha=0.85)
ax_epp.fill_between(times, epp_qd, alpha=0.1, color=color)
ax_kv.plot(times, kv, label=label, color=color, linewidth=2, alpha=0.85)
ax_kv.fill_between(times, kv, alpha=0.1, color=color)
ax_qd.set_title('vLLM Queue Depth (vllm:num_requests_waiting)', fontweight='bold')
ax_qd.set_ylabel('Waiting Requests')
ax_qd.legend(fontsize=11)
ax_epp.set_title('EPP Queue Depth (inference_extension_flow_control_queue_size)', fontweight='bold')
ax_epp.set_ylabel('Queued Requests')
ax_epp.legend(fontsize=11)
ax_kv.set_title('KV Cache Utilization (vllm:kv_cache_usage_perc)', fontweight='bold')
ax_kv.set_ylabel('Utilization')
ax_kv.set_xlabel('Time (seconds)')
ax_kv.legend(fontsize=11)
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-metrics-timeline.png'), bbox_inches='tight', dpi=150)
plt.close()
# 4. Throughput percentile distribution
pct_keys = ['p05', 'p10', 'p25', 'p50', 'p75', 'p90', 'p95', 'p99']
pct_labels = ['p5', 'p10', 'p25', 'p50', 'p75', 'p90', 'p95', 'p99']
x = np.arange(len(pct_labels))
w = 0.35
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Percentile Distributions — HPA vs WVA', fontsize=15, fontweight='bold')
for ax, metric, unit, div in [(ax1,'ttft','seconds',1000),(ax2,'itl','ms',1),(ax3,'throughput','tok/s',1)]:
hv = [m(hpa, metric)['percentiles'].get(k, 0)/div for k in pct_keys]
wv = [m(wva, metric)['percentiles'].get(k, 0)/div for k in pct_keys]
ax.bar(x - w/2, hv, w, label='HPA', color=HPA_C, alpha=0.85)
ax.bar(x + w/2, wv, w, label='WVA', color=WVA_C, alpha=0.85)
ax.set_xticks(x); ax.set_xticklabels(pct_labels, fontsize=9)
ax.set_title(f'{metric.upper()} Percentiles', fontweight='bold')
ax.set_ylabel(unit); ax.legend(fontsize=9)
fig.tight_layout()
fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-percentiles.png'), bbox_inches='tight', dpi=150)
plt.close()
print(f"Generated benchmark plots in {PANEL_DIR}")
for f in sorted(os.listdir(PANEL_DIR)):
if f.endswith('.png'):
print(f" {f}")
# --- Generate per-autoscaler PDF reports (colleague format) ---
# Builds one multi-page PDF per autoscaler run (HPA baseline and WVA):
#   Page 1: configuration & results summary as a monospace text panel
#   Page 2: time-series charts (KV cache, queue depth, replicas, EPP queue)
#   Page 3: GuideLLM percentile bar charts + compact summary
from matplotlib.backends.backend_pdf import PdfPages
import textwrap  # NOTE(review): imported but never used below — candidate for removal
model_id = os.environ.get('MODEL_ID', 'unknown')
model_short = model_id.split('/')[-1]  # strip the HF org prefix, e.g. "org/model" -> "model"
for data in [hpa, wva]:
    atype = data['autoscaler_type']
    pdf_path = os.path.join(PANEL_DIR, f'report_{atype.lower()}_{model_short}.pdf')
    with PdfPages(pdf_path) as pdf:
        # Page 1: Configuration & Summary
        fig, ax = plt.subplots(figsize=(11, 8.5))
        ax.axis('off')  # the page is pure text, no plot
        lines = []
        sep = '='*80
        dash = '-'*80
        # Config fields are optional in the results JSON; fall back to 'N/A'.
        va_cfg = data.get('va_config', 'N/A')
        hpa_cfg = data.get('hpa_config', 'N/A')
        data_model = data.get('model_id', model_id)
        if atype == 'WVA':
            atype_label = 'Workload Variant Autoscaler (WVA)'
        else:
            atype_label = 'HPA Baseline (VA-constrained + HPA)'
        lines.append(sep)
        lines.append(f"AUTOSCALER TYPE : {atype_label}")
        lines.append(f"MODEL : {data_model}")
        lines.append(sep)
        lines.append(f"Autoscaler Configuration")
        lines.append(dash)
        lines.append(f" Variant (VA) : {va_cfg}")
        lines.append(f" HPA : {hpa_cfg}")
        lines.append(sep)
        # NOTE(review): the load-generator and EPP settings below are hard-coded
        # prose; keep in sync with the actual benchmark configuration.
        lines.append(f"Benchmark Load Generator Configuration")
        lines.append(dash)
        lines.append(f" Profile : poisson @ 20 req/s")
        lines.append(f" Prompt tokens : 4000 | Output tokens: 1000")
        lines.append(f" Max seconds : 600 | Seed: 42")
        lines.append(sep)
        lines.append(f"EPP Configuration")
        lines.append(dash)
        lines.append(f" Flow Control : ENABLED")
        lines.append(f" Scorers : queue-scorer=2, kv-cache-utilization-scorer=2, prefix-cache-scorer=3")
        lines.append(sep)
        lines.append(f"Results Summary")
        lines.append(dash)
        # Metric objects may be missing or non-dict on failed runs; guard each access.
        tp_obj = data.get('throughput', {})
        ttft_obj = data.get('ttft', {})
        itl_obj = data.get('itl', {})
        tp_mean = tp_obj.get('mean', 0) if isinstance(tp_obj, dict) else 0
        ttft_mean = ttft_obj.get('mean', 0) if isinstance(ttft_obj, dict) else 0
        ttft_p50 = ttft_obj.get('percentiles', {}).get('p50', 0) if isinstance(ttft_obj, dict) else 0
        ttft_p99 = ttft_obj.get('percentiles', {}).get('p99', 0) if isinstance(ttft_obj, dict) else 0
        itl_mean = itl_obj.get('mean', 0) if isinstance(itl_obj, dict) else 0
        itl_p50 = itl_obj.get('percentiles', {}).get('p50', 0) if isinstance(itl_obj, dict) else 0
        itl_p99 = itl_obj.get('percentiles', {}).get('p99', 0) if isinstance(itl_obj, dict) else 0
        # Completed-request count is taken from the TTFT sample count.
        completed = ttft_obj.get('count', 0) if isinstance(ttft_obj, dict) else 0
        error_count = data.get('error_count', 0)
        incomplete_count = data.get('incomplete_count', 0)
        achieved_rps = data.get('achieved_rps', 0)
        lines.append(f" Completed Requests : {completed}")
        lines.append(f" Failed Requests : {error_count}")
        lines.append(f" Incomplete Requests : {incomplete_count}")
        lines.append(f" Achieved RPS : {achieved_rps:.2f}")
        lines.append(f" Throughput (mean) : {tp_mean:.1f} tok/s")
        lines.append(f" Max Replicas : {data['max_replicas']}")
        lines.append(f" Avg Replicas : {data['avg_replicas']:.2f}")
        lines.append(f" Avg vLLM Queue : {data['avg_queue_depth']:.1f}")
        lines.append(f" Avg EPP Queue : {data.get('avg_epp_queue_depth', 0):.1f}")
        lines.append(f" Avg KV Cache : {data['avg_kv_cache']*100:.2f}%")
        lines.append(dash)
        # TTFT values are stored in ms; render as seconds.
        lines.append(f" TTFT mean={ttft_mean/1000:.2f}s p50={ttft_p50/1000:.2f}s p99={ttft_p99/1000:.2f}s")
        lines.append(f" ITL mean={itl_mean:.2f}ms p50={itl_p50:.2f}ms p99={itl_p99:.2f}ms")
        lines.append(f" Duration: {data['duration_sec']:.0f}s")
        lines.append(sep)
        ax.text(0.05, 0.95, '\n'.join(lines), transform=ax.transAxes, fontsize=8.5,
                verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8))
        fig.suptitle(f'{atype} Benchmark Report — {model_short}', fontsize=14, fontweight='bold')
        pdf.savefig(fig, bbox_inches='tight')
        plt.close()
        # Page 2: Time-series charts (KV Cache, Queue, Replicas, EPP Queue)
        mt = data.get('metrics_timeline', [])
        tl = data.get('replica_timeline', [])
        if mt and tl:  # skip the page entirely when either timeline is missing
            fig, axes = plt.subplots(4, 1, figsize=(11, 14), sharex=True)
            fig.suptitle(f'{atype} — Metrics Over Time ({model_short})', fontsize=14, fontweight='bold')
            # Per-autoscaler accent color, consistent with the comparison plots above.
            color = WVA_C if atype == 'WVA' else HPA_C
            times_m = [s['elapsed_sec'] for s in mt]
            kv = [s['kv_cache']*100 for s in mt]  # fraction -> percent
            qd = [s['queue_depth'] for s in mt]
            epp = [s.get('epp_queue_depth', 0) for s in mt]  # older snapshots may lack this key
            times_r = [s['elapsed_sec'] for s in tl]
            ready = [s['ready_replicas'] for s in tl]
            axes[0].plot(times_m, kv, color=color, linewidth=2)
            axes[0].fill_between(times_m, kv, alpha=0.15, color=color)
            axes[0].set_ylabel('KV Cache Usage (%)')
            axes[0].set_title('KV Cache Usage Over Time')
            axes[1].plot(times_m, qd, color=color, linewidth=2)
            axes[1].fill_between(times_m, qd, alpha=0.15, color=color)
            axes[1].set_ylabel('Requests Waiting')
            axes[1].set_title('Number of Requests Waiting Over Time')
            # Step plot: replica count changes discretely; 'post' holds the value
            # until the next snapshot.
            axes[2].step(times_r, ready, where='post', color=color, linewidth=2.5, label='Actual Replicas')
            axes[2].fill_between(times_r, ready, step='post', alpha=0.1, color=color)
            # Overlay EPP queue on a secondary y-axis of the replica chart.
            ax2b = axes[2].twinx()
            ax2b.plot(times_m, epp, color='#3498db', linewidth=1.5, alpha=0.7, label='EPP Queue')
            ax2b.set_ylabel('EPP Queue Size', color='#3498db')
            axes[2].set_ylabel('Replica Count')
            axes[2].set_title('Replica Count & EPP Queue Over Time')
            axes[2].yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
            axes[2].legend(loc='upper left', fontsize=9)
            ax2b.legend(loc='upper right', fontsize=9)
            axes[3].plot(times_m, epp, color='#3498db', linewidth=2)
            axes[3].fill_between(times_m, epp, alpha=0.15, color='#3498db')
            axes[3].set_ylabel('EPP Queue Size')
            axes[3].set_xlabel('Time (seconds)')
            axes[3].set_title('EPP Flow Control Queue Size Over Time')
            fig.tight_layout()
            pdf.savefig(fig, bbox_inches='tight')
            plt.close()
        # Page 3: GuideLLM metrics (TTFT, ITL, Throughput distributions)
        has_pcts = isinstance(ttft_obj, dict) and 'percentiles' in ttft_obj
        if has_pcts:
            fig, axes = plt.subplots(2, 2, figsize=(11, 8.5))
            fig.suptitle(f'{atype} — GuideLLM Latency & Throughput ({model_short})', fontsize=14, fontweight='bold')
            pct_keys = ['p05','p10','p25','p50','p75','p90','p95','p99']
            pct_labels = ['p5','p10','p25','p50','p75','p90','p95','p99']
            x = np.arange(len(pct_labels))
            # TTFT: ms -> seconds for display; mean drawn as a reference line.
            ttft_vals = [ttft_obj.get('percentiles', {}).get(k, 0)/1000 for k in pct_keys]
            axes[0,0].bar(x, ttft_vals, color=color, alpha=0.85)
            axes[0,0].set_xticks(x); axes[0,0].set_xticklabels(pct_labels, fontsize=8)
            axes[0,0].set_title('TTFT Percentiles', fontweight='bold')
            axes[0,0].set_ylabel('seconds')
            axes[0,0].axhline(y=ttft_mean/1000, color='red', linestyle='--', label=f'mean={ttft_mean/1000:.1f}s')
            axes[0,0].legend(fontsize=8)
            itl_vals = [itl_obj.get('percentiles', {}).get(k, 0) for k in pct_keys]
            axes[0,1].bar(x, itl_vals, color=color, alpha=0.85)
            axes[0,1].set_xticks(x); axes[0,1].set_xticklabels(pct_labels, fontsize=8)
            axes[0,1].set_title('ITL Percentiles', fontweight='bold')
            axes[0,1].set_ylabel('ms')
            axes[0,1].axhline(y=itl_mean, color='red', linestyle='--', label=f'mean={itl_mean:.2f}ms')
            axes[0,1].legend(fontsize=8)
            tp_vals = [tp_obj.get('percentiles', {}).get(k, 0) for k in pct_keys]
            axes[1,0].bar(x, tp_vals, color=color, alpha=0.85)
            axes[1,0].set_xticks(x); axes[1,0].set_xticklabels(pct_labels, fontsize=8)
            axes[1,0].set_title('Throughput Percentiles', fontweight='bold')
            axes[1,0].set_ylabel('tok/s')
            axes[1,0].axhline(y=tp_mean, color='red', linestyle='--', label=f'mean={tp_mean:.0f}')
            axes[1,0].legend(fontsize=8)
            # Fourth quadrant: text summary instead of a chart.
            axes[1,1].axis('off')
            summary_lines = [
                f"Completed : {completed}",
                f"Failed : {error_count}",
                f"Incomplete: {incomplete_count}",
                f"RPS : {achieved_rps:.2f}",
                f"",
                f"Throughput: {tp_mean:.0f} tok/s",
                f"TTFT mean : {ttft_mean/1000:.2f}s",
                f"ITL mean : {itl_mean:.2f}ms",
                f"",
                f"Avg Replicas: {data['avg_replicas']:.2f}",
                f"Max Replicas: {data['max_replicas']}",
                f"Avg KV Cache: {data['avg_kv_cache']*100:.2f}%",
            ]
            axes[1,1].text(0.1, 0.85, '\n'.join(summary_lines), transform=axes[1,1].transAxes,
                           fontsize=11, verticalalignment='top', fontfamily='monospace',
                           bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8))
            axes[1,1].set_title('Summary', fontweight='bold')
            fig.tight_layout()
            pdf.savefig(fig, bbox_inches='tight')
            plt.close()
    print(f" Generated PDF report: {pdf_path}")
print(f"Generated benchmark plots in {PANEL_DIR}")
for f in sorted(os.listdir(PANEL_DIR)):
    if f.endswith('.png') or f.endswith('.pdf'):
        print(f"  {f}")
PLOTEOF
# Upload raw results, Grafana snapshots, and generated panels even when the
# benchmark failed, so a broken run can still be debugged from artifacts.
- name: Upload benchmark results
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: benchmark-results-openshift
    path: |
      /tmp/benchmark-results.json
      /tmp/prefill-benchmark-results.json
      /tmp/benchmark-grafana-snapshot.txt
      /tmp/benchmark-grafana-snapshot.json
      /tmp/benchmark-panels/
    # Some of these files are optional depending on which benchmark phases ran;
    # warn (don't fail) when any are missing.
    if-no-files-found: warn
# Publish a results summary comment on the PR: scale-up metrics table,
# optional HPA-vs-WVA comparison, dashboard panel images (via a prerelease),
# and a link to the uploaded artifacts. Runs even on failure so partial
# results are still reported.
- name: Post benchmark results as PR comment
  if: always() && (github.event_name == 'issue_comment' || needs.gate.outputs.pr_number != '')
  uses: actions/github-script@v7
  with:
    script: |
      const fs = require('fs');
      const path = require('path');
      // Explicit radix; the gate output may be empty on workflow_dispatch runs.
      const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}', 10);
      const sha = '${{ needs.gate.outputs.pr_head_sha }}';
      const runId = context.runId;
      const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;
      // Without a PR number, createComment would fail with a NaN issue number —
      // bail out early instead.
      if (!Number.isInteger(prNumber)) {
        console.log('No PR number available; skipping results comment.');
        return;
      }
      // Prefer a direct artifact download URL; fall back to the run page.
      let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
      try {
        const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
          owner: context.repo.owner,
          repo: context.repo.repo,
          run_id: runId
        });
        const benchArtifact = artifacts.find(a => a.name === 'benchmark-results-openshift');
        if (benchArtifact) {
          artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
        }
      } catch (e) {
        console.log(`Could not look up artifact: ${e.message}`);
      }
      // Scale-up-latency results table (best effort: the file is absent when
      // that benchmark phase did not run or crashed).
      let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.';
      try {
        const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8'));
        // Negative durations are sentinel values for "event never observed".
        const fmtTime = (v) => v < 0 ? 'N/A' : `${v.toFixed(1)}s`;
        resultsTable = `| Metric | Value |
      |--------|-------|
      | Scale-up time | ${fmtTime(data.scaleUpTimeSec)} |
      | Scale-down time | ${fmtTime(data.scaleDownTimeSec)} |
      | Max replicas | ${data.maxReplicas} |
      | Avg KV cache usage | ${data.avgKVCacheUsage.toFixed(3)} |
      | Avg queue depth | ${data.avgQueueDepth.toFixed(1)} |
      | Replica oscillation (σ) | ${data.replicaOscillation.toFixed(2)} |
      | Total duration | ${data.totalDurationSec.toFixed(0)}s |`;
      } catch (e) {
        console.log(`Could not read results: ${e.message}`);
      }
      // Prefill-heavy comparison section (HPA baseline vs WVA), if available.
      let prefillSection = '';
      try {
        const prefillData = JSON.parse(fs.readFileSync('/tmp/prefill-benchmark-results.json', 'utf8'));
        if (Array.isArray(prefillData) && prefillData.length >= 2) {
          const hpa = prefillData.find(r => r.autoscaler_type === 'HPA');
          const wva = prefillData.find(r => r.autoscaler_type === 'WVA');
          if (hpa && wva) {
            // Relative change of WVA vs the HPA baseline; '—' when the baseline
            // is zero (undefined ratio).
            const delta = (h, w) => {
              if (h === 0) return '—';
              const pct = ((w - h) / Math.abs(h)) * 100;
              const arrow = pct < 0 ? '↓' : '↑';
              const sign = pct > 0 ? '+' : '';
              return `${sign}${pct.toFixed(1)}% ${arrow}`;
            };
            // Missing percentile keys fall back to 0 instead of rendering 'NaN'.
            const fmtP = (obj, key, div=1) => obj && obj.percentiles ? ((obj.percentiles[key] || 0)/div).toFixed(1) : 'N/A';
            const fmtM = (obj, div=1, prec=1) => obj ? (obj.mean/div).toFixed(prec) : 'N/A';
            let table = `| Metric | HPA (Baseline) | WVA | Δ |
      |--------|---------------|-----|---|
      | **Max Replicas** | ${hpa.max_replicas} | **${wva.max_replicas}** | ${delta(hpa.max_replicas, wva.max_replicas)} |
      | **Avg Replicas** | ${hpa.avg_replicas.toFixed(2)} | **${wva.avg_replicas.toFixed(2)}** | ${delta(hpa.avg_replicas, wva.avg_replicas)} |
      | **Avg vLLM Queue Depth** | ${hpa.avg_queue_depth.toFixed(1)} | **${wva.avg_queue_depth.toFixed(1)}** | ${delta(hpa.avg_queue_depth, wva.avg_queue_depth)} |
      | **Avg EPP Queue Depth** | ${(hpa.avg_epp_queue_depth||0).toFixed(1)} | **${(wva.avg_epp_queue_depth||0).toFixed(1)}** | ${delta(hpa.avg_epp_queue_depth||0, wva.avg_epp_queue_depth||0)} |
      | **Avg KV Cache** | ${hpa.avg_kv_cache.toFixed(3)} | ${wva.avg_kv_cache.toFixed(3)} | ${delta(hpa.avg_kv_cache, wva.avg_kv_cache)} |
      | **TTFT mean** | ${fmtM(hpa.ttft, 1000)}s | **${fmtM(wva.ttft, 1000)}s** | ${hpa.ttft && wva.ttft ? delta(hpa.ttft.mean, wva.ttft.mean) : '—'} |
      | **TTFT p50** | ${fmtP(hpa.ttft, 'p50', 1000)}s | **${fmtP(wva.ttft, 'p50', 1000)}s** | — |
      | **TTFT p99** | ${fmtP(hpa.ttft, 'p99', 1000)}s | **${fmtP(wva.ttft, 'p99', 1000)}s** | — |
      | **ITL mean** | ${fmtM(hpa.itl, 1, 2)}ms | **${fmtM(wva.itl, 1, 2)}ms** | ${hpa.itl && wva.itl ? delta(hpa.itl.mean, wva.itl.mean) : '—'} |
      | **Throughput mean** | ${fmtM(hpa.throughput)}tok/s | **${fmtM(wva.throughput)}tok/s** | ${hpa.throughput && wva.throughput ? delta(hpa.throughput.mean, wva.throughput.mean) : '—'} |
      | **Throughput p50** | ${fmtP(hpa.throughput, 'p50')}tok/s | **${fmtP(wva.throughput, 'p50')}tok/s** | — |
      | **Completed Requests** | ${hpa.ttft ? hpa.ttft.count : 'N/A'} | **${wva.ttft ? wva.ttft.count : 'N/A'}** | ${hpa.ttft && wva.ttft ? delta(hpa.ttft.count, wva.ttft.count) : '—'} |
      | **Failed Requests** | ${hpa.error_count || 0} | ${wva.error_count || 0} | — |
      | **Incomplete Requests** | ${hpa.incomplete_count || 0} | ${wva.incomplete_count || 0} | — |
      | **Achieved RPS** | ${(hpa.achieved_rps || 0).toFixed(2)} | ${(wva.achieved_rps || 0).toFixed(2)} | — |
      | **Duration** | ${hpa.duration_sec.toFixed(0)}s | ${wva.duration_sec.toFixed(0)}s | — |`;
            // Collapsible replica timelines, one per autoscaler.
            let timelines = '';
            for (const r of [hpa, wva]) {
              if (r.replica_timeline && r.replica_timeline.length > 0) {
                timelines += `\n<details>\n<summary>${r.autoscaler_type} Replica Timeline (${r.replica_timeline.length} snapshots)</summary>\n\n| Time (s) | Spec | Ready |\n|----------|------|-------|\n`;
                for (const s of r.replica_timeline) {
                  timelines += `| ${s.elapsed_sec.toFixed(0)} | ${s.spec_replicas} | ${s.ready_replicas} |\n`;
                }
                timelines += `\n</details>\n`;
              }
            }
            prefillSection = `\n\n---\n\n## Benchmark: prefill-heavy-workload (OpenShift)\n\n${table}\n${timelines}`;
          }
        } else if (Array.isArray(prefillData) && prefillData.length > 0) {
          // Only one autoscaler ran: emit per-run tables without the Δ column.
          let rows = '';
          for (const r of prefillData) {
            rows += `\n### ${r.autoscaler_type}\n\n| Metric | Value |\n|--------|-------|\n| Duration | ${r.duration_sec.toFixed(0)}s |\n| Max Replicas | ${r.max_replicas} |\n| Avg Replicas | ${r.avg_replicas.toFixed(2)} |\n| Avg vLLM Queue Depth | ${r.avg_queue_depth.toFixed(2)} |\n| Avg EPP Queue Depth | ${(r.avg_epp_queue_depth||0).toFixed(2)} |\n| Avg KV Cache | ${r.avg_kv_cache.toFixed(3)} |\n`;
          }
          prefillSection = `\n\n---\n\n## Benchmark: prefill-heavy-workload (OpenShift)\n${rows}`;
        }
      } catch (e) {
        console.log(`Could not read prefill results: ${e.message}`);
      }
      // Panel PNGs cannot be embedded in a comment directly; upload them as
      // prerelease assets and link the asset URLs.
      let panelImages = '';
      const panelDir = '/tmp/benchmark-panels';
      const hasPanels = fs.existsSync(panelDir) && fs.readdirSync(panelDir).some(f => f.endsWith('.png'));
      if (hasPanels) {
        const pngs = fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort();
        const tag = `benchmark-run-os-${runId}`;
        try {
          const release = await github.rest.repos.createRelease({
            owner: context.repo.owner,
            repo: context.repo.repo,
            tag_name: tag,
            name: `Benchmark panels OpenShift (PR #${prNumber}, ${sha.substring(0, 7)})`,
            body: `Auto-generated by benchmark CI run #${runId}`,
            draft: false,
            prerelease: true
          });
          const imageUrls = [];
          for (const png of pngs) {
            const filePath = path.join(panelDir, png);
            const fileData = fs.readFileSync(filePath);
            const asset = await github.rest.repos.uploadReleaseAsset({
              owner: context.repo.owner,
              repo: context.repo.repo,
              release_id: release.data.id,
              name: png,
              data: fileData,
              headers: { 'content-type': 'image/png' }
            });
            // Derive a human-readable title from the file name.
            const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' ');
            imageUrls.push(`#### ${title}\n![${title}](${asset.data.browser_download_url})`);
          }
          if (imageUrls.length > 0) {
            panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`;
          }
        } catch (e) {
          console.log(`Could not upload panel images: ${e.message}`);
        }
      }
      const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
      let artifactsSection = '';
      if (hasSnapshotJson || hasPanels) {
        const items = [];
        if (hasSnapshotJson) items.push('Grafana snapshot JSON');
        if (hasPanels) items.push('dashboard panel images');
        artifactsSection = `\n\n📎 **[Download artifacts](${artifactUrl})**${items.length ? ' — ' + items.join(', ') : ''}`;
      }
      // Blank lines around the heading, <details>, and list are required for
      // GitHub-flavored markdown to render the table and bullet list correctly.
      const body = `## Benchmark: scale-up-latency (OpenShift)

      ${resultsTable}${prefillSection}${panelImages}${artifactsSection}

      <details>
      <summary>Environment</summary>

      - Cluster: OpenShift (Real GPUs)
      - Model: ${process.env.MODEL_ID || 'unsloth/Meta-Llama-3.1-8B'}
      - Accelerator: H100
      - Commit: ${sha.substring(0, 7)}
      - Scaler: prometheus-adapter
      - [Workflow run](${repoUrl}/actions/runs/${runId})

      </details>`;
      await github.rest.issues.createComment({
        owner: context.repo.owner,
        repo: context.repo.repo,
        issue_number: prNumber,
        body: body
      });
# Best-effort teardown: every command tolerates absence or failure (|| true)
# so cleanup can never fail the job, even after a partial deployment.
- name: Cleanup infrastructure
  if: always()
  run: |
    # Remove the WVA release first, then every Helm release in the llm-d namespace.
    helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
      helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    done
    # Finally delete both namespaces (sweeps up any remaining resources).
    kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
    kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
# Final job: set a commit status on the PR head SHA reflecting the outcome of
# whichever benchmark job (kind or openshift) the gate selected. Runs whenever
# the gate approved a benchmark, regardless of how the benchmark itself ended.
report-status:
  runs-on: ubuntu-latest
  needs: [gate, benchmark-kind, benchmark-openshift]
  if: always() && needs.gate.outputs.run_benchmark == 'true'
  permissions:
    statuses: write
  steps:
    - name: Report status to PR
      uses: actions/github-script@v7
      with:
        script: |
          const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}';
          const platform = '${{ needs.gate.outputs.platform }}';
          // Pick the result of the job that actually ran for this platform.
          let benchResult;
          if (platform === 'openshift') {
            benchResult = '${{ needs.benchmark-openshift.result }}';
          } else {
            benchResult = '${{ needs.benchmark-kind.result }}';
          }
          // workflow_dispatch runs have no associated PR commit to report on.
          if (!prHeadSha) {
            console.log('No PR head SHA available, skipping status report');
            return;
          }
          // Anything other than explicit success (skipped, cancelled, failure)
          // is reported as a failed check so the PR is not green by accident.
          let state, description;
          if (benchResult === 'success') {
            state = 'success';
            description = 'Benchmark completed successfully';
          } else if (benchResult === 'skipped') {
            state = 'failure';
            description = 'Benchmark did not run (prerequisite failed or skipped)';
          } else if (benchResult === 'cancelled') {
            state = 'failure';
            description = 'Benchmark cancelled';
          } else {
            state = 'failure';
            description = 'Benchmark failed';
          }
          // The status context must name the platform that actually ran;
          // previously it was hard-coded to "benchmark-kind", so openshift runs
          // reported under the wrong check name.
          const statusContext = `${{ github.workflow }} / benchmark-${platform === 'openshift' ? 'openshift' : 'kind'}`;
          console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`);
          await github.rest.repos.createCommitStatus({
            owner: context.repo.owner,
            repo: context.repo.repo,
            sha: prHeadSha,
            state: state,
            target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`,
            description: description,
            context: statusContext
          });
          console.log('Status reported successfully');