Benchmark: openshift | Qwen/Qwen3-0.6B | feat/benchmark-phase3-openshift #219
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: CI - Benchmark | |
| run-name: >- | |
| ${{ github.event_name == 'workflow_dispatch' | |
| && format('Benchmark: {0} | {1} | {2}', | |
| inputs.platform, | |
| inputs.model_id || 'unsloth/Meta-Llama-3.1-8B', | |
| github.ref_name) | |
| || format('Benchmark: PR #{0} | {1}', | |
| github.event.issue.number, | |
| github.event.comment.body) }} | |
| concurrency: | |
| group: >- | |
| ${{ | |
| github.event_name == 'issue_comment' && | |
| !contains(github.event.comment.body, '/benchmark kind') && | |
| !contains(github.event.comment.body, '/benchmark openshift') | |
| && format('benchmark-isolated-{0}', github.run_id) | |
| || format('benchmark-{0}', | |
| github.event.issue.number | |
| || github.run_id) | |
| }} | |
| cancel-in-progress: true | |
| on: | |
| issue_comment: | |
| types: [created] | |
| workflow_dispatch: | |
| inputs: | |
| platform: | |
| description: 'Platform: kind or openshift' | |
| required: true | |
| default: 'kind' | |
| type: choice | |
| options: [kind, openshift] | |
| model_id: | |
| description: 'Model to benchmark (HuggingFace ID)' | |
| required: false | |
| default: 'unsloth/Meta-Llama-3.1-8B' | |
| type: string | |
| jobs: | |
| gate: | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| pull-requests: write | |
| outputs: | |
| run_benchmark: ${{ steps.check.outputs.run_benchmark }} | |
| platform: ${{ steps.check.outputs.platform }} | |
| pr_number: ${{ steps.check.outputs.pr_number }} | |
| pr_head_sha: ${{ steps.check.outputs.pr_head_sha }} | |
| pr_head_repo: ${{ steps.check.outputs.pr_head_repo }} | |
| steps: | |
| - name: Check if benchmark requested | |
| id: check | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| async function hasWriteAccess(username) { | |
| try { | |
| const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| username: username | |
| }); | |
| const privilegedRoles = ['admin', 'maintain', 'write']; | |
| return privilegedRoles.includes(permission.permission); | |
| } catch (e) { | |
| console.log(`Could not get permissions for ${username}: ${e.message}`); | |
| return false; | |
| } | |
| } | |
| if (context.eventName !== 'issue_comment' && context.eventName !== 'workflow_dispatch') { | |
| core.setOutput('run_benchmark', 'false'); | |
| return; | |
| } | |
| if (context.eventName === 'workflow_dispatch') { | |
| const platform = context.payload.inputs.platform; | |
| const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; | |
| console.log(`Manual benchmark dispatch for ${platform}`); | |
| core.setOutput('run_benchmark', 'true'); | |
| core.setOutput('platform', platform); | |
| // Try to find a PR for the current branch so we can post results | |
| const branch = context.ref.replace('refs/heads/', ''); | |
| const { data: prs } = await github.rest.pulls.list({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| head: `${context.repo.owner}:${branch}`, | |
| state: 'open', | |
| }); | |
| if (prs.length > 0) { | |
| core.setOutput('pr_number', prs[0].number.toString()); | |
| core.setOutput('pr_head_sha', prs[0].head.sha); | |
| console.log(`Found open PR #${prs[0].number} for branch ${branch}`); | |
| } else { | |
| console.log(`No open PR found for branch ${branch}, skipping PR outputs`); | |
| } | |
| return; | |
| } | |
| const comment = context.payload.comment.body.trim(); | |
| const issue = context.payload.issue; | |
| if (!issue.pull_request) { | |
| console.log('Comment is not on a PR, skipping'); | |
| core.setOutput('run_benchmark', 'false'); | |
| return; | |
| } | |
| const validCommands = ['/benchmark kind', '/benchmark openshift']; | |
| if (!validCommands.includes(comment)) { | |
| console.log(`Comment "${comment}" is not a valid benchmark command, skipping`); | |
| core.setOutput('run_benchmark', 'false'); | |
| return; | |
| } | |
| const commenter = context.payload.comment.user.login; | |
| const hasAccess = await hasWriteAccess(commenter); | |
| if (!hasAccess) { | |
| console.log(`User ${commenter} does not have write access, ignoring ${comment}`); | |
| core.setOutput('run_benchmark', 'false'); | |
| return; | |
| } | |
| const { data: pr } = await github.rest.pulls.get({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| pull_number: issue.number | |
| }); | |
| const baseRepo = `${context.repo.owner}/${context.repo.repo}`; | |
| const headRepo = pr.head.repo ? pr.head.repo.full_name : baseRepo; | |
| console.log(`/benchmark kind approved by ${commenter} for PR #${issue.number}`); | |
| console.log(`PR head SHA: ${pr.head.sha}`); | |
| await github.rest.reactions.createForIssueComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| comment_id: context.payload.comment.id, | |
| content: 'rocket' | |
| }); | |
| const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; | |
| const platform = comment.includes('openshift') ? 'OpenShift' : 'Kind'; | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: issue.number, | |
| body: `🚀 **Benchmark (${platform})** triggered by \`${comment}\`\n\n[View the benchmark workflow run](${runUrl})` | |
| }); | |
| core.setOutput('run_benchmark', 'true'); | |
| core.setOutput('platform', platform.toLowerCase()); | |
| core.setOutput('pr_number', issue.number.toString()); | |
| core.setOutput('pr_head_sha', pr.head.sha); | |
| core.setOutput('pr_head_repo', headRepo); | |
| build-image: | |
| needs: gate | |
| if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift') | |
| runs-on: ubuntu-latest | |
| outputs: | |
| image_tag: ${{ steps.build.outputs.image_tag }} | |
| steps: | |
| - name: Checkout source | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ needs.gate.outputs.pr_head_sha }} | |
| - name: Log in to GHCR | |
| uses: docker/login-action@v3 | |
| with: | |
| registry: ghcr.io | |
| username: ${{ secrets.CR_USER }} | |
| password: ${{ secrets.CR_TOKEN }} | |
| - name: Build and push image | |
| id: build | |
| env: | |
| REGISTRY: ghcr.io | |
| IMAGE_NAME: ${{ github.repository }} | |
| GIT_REF: ${{ needs.gate.outputs.pr_head_sha }} | |
| run: | | |
| IMAGE_TAG="bench-$(printf '%s' "$GIT_REF" | cut -c1-8)" | |
| FULL_IMAGE="${REGISTRY}/${IMAGE_NAME}:${IMAGE_TAG}" | |
| echo "Building image: $FULL_IMAGE" | |
| make docker-build IMG="$FULL_IMAGE" | |
| make docker-push IMG="$FULL_IMAGE" | |
| echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT | |
| benchmark-kind: | |
| runs-on: ubuntu-latest | |
| needs: [gate] | |
| if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'kind' || github.event.inputs.platform == 'kind') | |
| timeout-minutes: 45 | |
| permissions: | |
| contents: write | |
| statuses: write | |
| pull-requests: write | |
| actions: read | |
| steps: | |
| - name: Set pending status on PR head | |
| if: github.event_name == 'issue_comment' | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| await github.rest.repos.createCommitStatus({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| sha: '${{ needs.gate.outputs.pr_head_sha }}', | |
| state: 'pending', | |
| target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, | |
| description: 'Benchmark running...', | |
| context: '${{ github.workflow }} / benchmark-kind' | |
| }); | |
| - name: Validate PR head SHA | |
| if: github.event_name == 'issue_comment' | |
| run: | | |
| if [ -z "${{ needs.gate.outputs.pr_head_sha }}" ]; then | |
| echo "::error::pr_head_sha is empty — refusing to fall back to main" | |
| exit 1 | |
| fi | |
| echo "Checkout will use PR head SHA: ${{ needs.gate.outputs.pr_head_sha }}" | |
| - name: Checkout source | |
| uses: actions/checkout@v4 | |
| with: | |
| repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }} | |
| ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }} | |
| token: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Extract Go version from go.mod | |
| run: sed -En 's/^go (.*)$/GO_VERSION=\1/p' go.mod >> $GITHUB_ENV | |
| - name: Set up Go with cache | |
| uses: actions/setup-go@v6 | |
| with: | |
| go-version: "${{ env.GO_VERSION }}" | |
| cache-dependency-path: ./go.sum | |
| - name: Install dependencies | |
| run: go mod download | |
| - name: Install Kind | |
| run: | | |
| ARCH=$(uname -m) | |
| case "$ARCH" in | |
| x86_64) KIND_ARCH="amd64" ;; | |
| aarch64) KIND_ARCH="arm64" ;; | |
| *) echo "Unsupported architecture: $ARCH"; exit 1 ;; | |
| esac | |
| curl -Lo ./kind "https://kind.sigs.k8s.io/dl/v0.25.0/kind-linux-${KIND_ARCH}" | |
| chmod +x ./kind | |
| sudo mv ./kind /usr/local/bin/kind | |
| kind version | |
| - name: Set up Docker Buildx | |
| uses: docker/setup-buildx-action@v3 | |
| - name: Build WVA image locally | |
| id: build-image | |
| env: | |
| CHECKOUT_SHA: ${{ needs.gate.outputs.pr_head_sha }} | |
| run: | | |
| IMAGE_NAME="llm-d-workload-variant-autoscaler" | |
| IMAGE_TAG="bench-${CHECKOUT_SHA:0:7}" | |
| FULL_IMAGE="localhost/${IMAGE_NAME}:${IMAGE_TAG}" | |
| echo "Building local image: $FULL_IMAGE" | |
| make docker-build IMG="$FULL_IMAGE" | |
| echo "image=$FULL_IMAGE" >> $GITHUB_OUTPUT | |
| - name: Deploy e2e infrastructure | |
| env: | |
| ENVIRONMENT: kind-emulator | |
| USE_SIMULATOR: "true" | |
| CREATE_CLUSTER: "true" | |
| INSTALL_GATEWAY_CTRLPLANE: "true" | |
| E2E_TESTS_ENABLED: "true" | |
| IMG: ${{ steps.build-image.outputs.image }} | |
| SKIP_BUILD: "true" | |
| KV_SPARE_TRIGGER: "0.1" | |
| QUEUE_SPARE_TRIGGER: "3" | |
| INSTALL_GRAFANA: "true" | |
| run: make deploy-e2e-infra | |
| - name: Run benchmark | |
| env: | |
| ENVIRONMENT: kind-emulator | |
| USE_SIMULATOR: "true" | |
| SCALER_BACKEND: prometheus-adapter | |
| BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json | |
| BENCHMARK_GRAFANA_ENABLED: "true" | |
| BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt | |
| BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json | |
| BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels | |
| KV_SPARE_TRIGGER: "0.1" | |
| QUEUE_SPARE_TRIGGER: "3" | |
| run: make test-benchmark | |
| - name: Upload benchmark results | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: benchmark-results | |
| path: | | |
| /tmp/benchmark-results.json | |
| /tmp/prefill-benchmark-results.json | |
| /tmp/benchmark-grafana-snapshot.txt | |
| /tmp/benchmark-grafana-snapshot.json | |
| /tmp/benchmark-panels/ | |
| if-no-files-found: warn | |
| - name: Post benchmark results as PR comment | |
| if: always() && github.event_name == 'issue_comment' | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| const fs = require('fs'); | |
| const path = require('path'); | |
| const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}'); | |
| const sha = '${{ needs.gate.outputs.pr_head_sha }}'; | |
| const runId = context.runId; | |
| const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`; | |
| // Look up the uploaded artifact to get a direct download link | |
| let artifactUrl = `${repoUrl}/actions/runs/${runId}`; | |
| try { | |
| const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| run_id: runId | |
| }); | |
| const benchArtifact = artifacts.find(a => a.name === 'benchmark-results'); | |
| if (benchArtifact) { | |
| artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`; | |
| } | |
| } catch (e) { | |
| console.log(`Could not look up artifact: ${e.message}`); | |
| } | |
| let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.'; | |
| try { | |
| const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8')); | |
| const fmtTime = (v) => v < 0 ? 'N/A' : `${v.toFixed(1)}s`; | |
| resultsTable = `| Metric | Value | | |
| |--------|-------| | |
| | Scale-up time | ${fmtTime(data.scaleUpTimeSec)} | | |
| | Scale-down time | ${fmtTime(data.scaleDownTimeSec)} | | |
| | Max replicas | ${data.maxReplicas} | | |
| | Avg KV cache usage | ${data.avgKVCacheUsage.toFixed(3)} | | |
| | Avg queue depth | ${data.avgQueueDepth.toFixed(1)} | | |
| | Replica oscillation (σ) | ${data.replicaOscillation.toFixed(2)} | | |
| | Total duration | ${data.totalDurationSec.toFixed(0)}s |`; | |
| } catch (e) { | |
| console.log(`Could not read results: ${e.message}`); | |
| } | |
| // Upload panel PNGs as release assets and collect URLs for embedding | |
| let panelImages = ''; | |
| const panelDir = '/tmp/benchmark-panels'; | |
| const hasPanels = fs.existsSync(panelDir) && | |
| fs.readdirSync(panelDir).some(f => f.endsWith('.png')); | |
| if (hasPanels) { | |
| const pngs = fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort(); | |
| const tag = `benchmark-run-${runId}`; | |
| try { | |
| // Create a lightweight release to host panel images | |
| const release = await github.rest.repos.createRelease({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| tag_name: tag, | |
| name: `Benchmark panels (PR #${prNumber}, ${sha.substring(0, 7)})`, | |
| body: `Auto-generated by benchmark CI run #${runId}`, | |
| draft: false, | |
| prerelease: true | |
| }); | |
| const imageUrls = []; | |
| for (const png of pngs) { | |
| const filePath = path.join(panelDir, png); | |
| const fileData = fs.readFileSync(filePath); | |
| const asset = await github.rest.repos.uploadReleaseAsset({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| release_id: release.data.id, | |
| name: png, | |
| data: fileData, | |
| headers: { 'content-type': 'image/png' } | |
| }); | |
| const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' '); | |
| imageUrls.push(`#### ${title}\n`); | |
| console.log(`Uploaded ${png}: ${asset.data.browser_download_url}`); | |
| } | |
| if (imageUrls.length > 0) { | |
| panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`; | |
| } | |
| } catch (e) { | |
| console.log(`Could not upload panel images: ${e.message}`); | |
| } | |
| } | |
| // Check for Grafana snapshot | |
| const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json'); | |
| let artifactsSection = ''; | |
| if (hasSnapshotJson || hasPanels) { | |
| const items = []; | |
| if (hasSnapshotJson) { | |
| items.push('Grafana snapshot JSON'); | |
| } | |
| artifactsSection = `\n\n📎 **[Download artifacts](${artifactUrl})**${items.length ? ' — ' + items.join(', ') : ''}`; | |
| } | |
| const body = `## Benchmark: scale-up-latency (Kind) | |
| ${resultsTable}${panelImages}${artifactsSection} | |
| <details> | |
| <summary>Environment</summary> | |
| - Cluster: Kind (emulated GPUs) | |
| - Model: unsloth/Meta-Llama-3.1-8B (simulator) | |
| - Commit: ${sha.substring(0, 7)} | |
| - Scaler: prometheus-adapter | |
| - [Workflow run](${repoUrl}/actions/runs/${runId}) | |
| </details>`; | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: prNumber, | |
| body: body | |
| }); | |
| - name: Cleanup Kind cluster | |
| if: always() | |
| run: kind delete cluster --name kind-wva-gpu-cluster || true | |
| benchmark-openshift: | |
| runs-on: [self-hosted, openshift, vllm-d] | |
| needs: [gate, build-image] | |
| if: needs.gate.outputs.run_benchmark == 'true' && (needs.gate.outputs.platform == 'openshift' || github.event.inputs.platform == 'openshift') | |
| timeout-minutes: 60 | |
| permissions: | |
| contents: write | |
| statuses: write | |
| pull-requests: write | |
| actions: read | |
| env: | |
| MODEL_ID: ${{ inputs.model_id || 'unsloth/Meta-Llama-3.1-8B' }} | |
| ACCELERATOR_TYPE: 'H100' | |
| GOTOOLCHAIN: auto | |
| LLMD_NAMESPACE: llm-d-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }} | |
| WVA_NAMESPACE: wva-benchmark-pr-${{ needs.gate.outputs.pr_number || github.run_id }} | |
| WVA_RELEASE_NAME: wva-bench-${{ github.run_id }} | |
| WVA_IMAGE_TAG: ${{ needs.build-image.outputs.image_tag }} | |
| steps: | |
| - name: Set pending status on PR head | |
| if: github.event_name == 'issue_comment' | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| await github.rest.repos.createCommitStatus({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| sha: '${{ needs.gate.outputs.pr_head_sha }}', | |
| state: 'pending', | |
| target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, | |
| description: 'Benchmark running on OpenShift...', | |
| context: '${{ github.workflow }} / benchmark-openshift' | |
| }); | |
| - name: Checkout source | |
| uses: actions/checkout@v4 | |
| with: | |
| repository: ${{ needs.gate.outputs.pr_head_repo || github.repository }} | |
| ref: ${{ needs.gate.outputs.pr_head_sha || github.sha }} | |
| token: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Set up Go | |
| uses: actions/setup-go@v6 | |
| with: | |
| go-version: "1.25.x" | |
| cache-dependency-path: ./go.sum | |
| - name: Install tools (kubectl, oc, helm, make) | |
| run: | | |
| sudo apt-get update && sudo apt-get install -y make | |
| KUBECTL_VERSION="v1.31.0" | |
| curl -fsSL --retry 3 --retry-delay 5 -o kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" | |
| chmod +x kubectl | |
| sudo mv kubectl /usr/local/bin/ | |
| curl -fsSL --retry 3 --retry-delay 5 -O "https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz" | |
| tar -xzf openshift-client-linux.tar.gz | |
| sudo mv oc /usr/local/bin/ | |
| rm -f openshift-client-linux.tar.gz kubectl README.md | |
| curl -fsSL --retry 3 --retry-delay 5 https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash | |
| - name: Verify cluster access | |
| run: | | |
| kubectl cluster-info | |
| kubectl get nodes | |
| - name: Get HF token from cluster secret | |
| id: hf-token | |
| run: | | |
| HF_TOKEN=$(kubectl get secret llm-d-hf-token -n default -o jsonpath='{.data.HF_TOKEN}' | base64 -d) | |
| echo "::add-mask::$HF_TOKEN" | |
| echo "HF_TOKEN=$HF_TOKEN" >> $GITHUB_ENV | |
| - name: Clean up resources for this PR | |
| run: | | |
| for ns in "$LLMD_NAMESPACE" "$WVA_NAMESPACE"; do | |
| if kubectl get namespace "$ns" &>/dev/null; then | |
| kubectl delete hpa -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true | |
| kubectl delete variantautoscaling -n "$ns" -l app.kubernetes.io/name=workload-variant-autoscaler --ignore-not-found || true | |
| for release in $(helm list -n "$ns" -q 2>/dev/null); do | |
| helm uninstall "$release" -n "$ns" --ignore-not-found --wait --timeout 60s || true | |
| done | |
| kubectl delete namespace "$ns" --ignore-not-found --timeout=60s || true | |
| fi | |
| done | |
| - name: Apply latest CRDs | |
| run: kubectl apply -f charts/workload-variant-autoscaler/crds/ | |
| - name: Deploy WVA and llm-d infrastructure | |
| env: | |
| ENVIRONMENT: openshift | |
| INSTALL_GATEWAY_CTRLPLANE: "false" | |
| E2E_TESTS_ENABLED: "true" | |
| NAMESPACE_SCOPED: "false" | |
| LLMD_NS: ${{ env.LLMD_NAMESPACE }} | |
| WVA_NS: ${{ env.WVA_NAMESPACE }} | |
| CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }} | |
| DEPLOY_VA: "false" | |
| DEPLOY_HPA: "false" | |
| DECODE_REPLICAS: "1" | |
| MONITORING_NAMESPACE: openshift-user-workload-monitoring | |
| WVA_METRICS_SECURE: "false" | |
| KV_SPARE_TRIGGER: "0.1" | |
| QUEUE_SPARE_TRIGGER: "3" | |
| VLLM_SVC_PORT: "8000" | |
| INSTALL_GRAFANA: "true" | |
| run: | | |
| ./deploy/install.sh --model "$MODEL_ID" --accelerator "$ACCELERATOR_TYPE" --release-name "$WVA_RELEASE_NAME" --environment openshift | |
| - name: Label namespaces for OpenShift monitoring | |
| run: | | |
| kubectl label namespace "$LLMD_NAMESPACE" openshift.io/user-monitoring=true --overwrite | |
| kubectl label namespace "$WVA_NAMESPACE" openshift.io/user-monitoring=true --overwrite | |
| - name: Wait for infrastructure to be ready | |
| run: | | |
| kubectl rollout status deployment -l app.kubernetes.io/name=workload-variant-autoscaler -n "$WVA_NAMESPACE" --timeout=300s || true | |
| kubectl rollout status deployment/ms-inference-scheduling-llm-d-modelservice-decode -n "$LLMD_NAMESPACE" --timeout=1500s || true | |
| echo "--- Services in openshift-user-workload-monitoring ---" | |
| kubectl get svc -n openshift-user-workload-monitoring | |
| echo "--- Services in openshift-monitoring ---" | |
| kubectl get svc -n openshift-monitoring | |
| - name: Run benchmark | |
| env: | |
| ENVIRONMENT: openshift | |
| USE_SIMULATOR: "false" | |
| SCALER_BACKEND: prometheus-adapter | |
| CONTROLLER_NAMESPACE: ${{ env.WVA_NAMESPACE }} | |
| E2E_MONITORING_NAMESPACE: openshift-user-workload-monitoring | |
| E2E_EMULATED_LLMD_NAMESPACE: ${{ env.LLMD_NAMESPACE }} | |
| CONTROLLER_INSTANCE: ${{ env.WVA_NAMESPACE }} | |
| BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json | |
| BENCHMARK_GRAFANA_ENABLED: "true" | |
| BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt | |
| BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json | |
| BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels | |
| KV_SPARE_TRIGGER: "0.1" | |
| QUEUE_SPARE_TRIGGER: "3" | |
| run: | | |
| # Get token for Thanos querier | |
| export PROMETHEUS_TOKEN=$(kubectl create token prometheus-k8s -n openshift-monitoring --duration=24h 2>/dev/null || echo "") | |
| # Start APIService guard: KEDA on this cluster continuously reclaims the | |
| # external.metrics.k8s.io APIService. This background loop re-patches it | |
| # every 8 seconds so the HPA can read wva_desired_replicas during the benchmark. | |
| # Key fix: caBundle must be set to null because KEDA sets it, and Kubernetes | |
| # rejects insecureSkipTLSVerify=true when caBundle is present. | |
| MONITORING_NS="openshift-user-workload-monitoring" | |
| ( | |
| while true; do | |
| sleep 8 | |
| current_svc=$(kubectl get apiservice v1beta1.external.metrics.k8s.io -o jsonpath='{.spec.service.name}' 2>/dev/null) | |
| current_ns=$(kubectl get apiservice v1beta1.external.metrics.k8s.io -o jsonpath='{.spec.service.namespace}' 2>/dev/null) | |
| if [ "$current_svc" != "prometheus-adapter" ] || [ "$current_ns" != "$MONITORING_NS" ]; then | |
| echo "[apiservice-guard] KEDA reclaimed (now: $current_svc/$current_ns), re-patching..." | |
| kubectl patch apiservice v1beta1.external.metrics.k8s.io --type=merge -p "{ | |
| \"spec\": { | |
| \"caBundle\": null, | |
| \"insecureSkipTLSVerify\": true, | |
| \"service\": { | |
| \"name\": \"prometheus-adapter\", | |
| \"namespace\": \"$MONITORING_NS\" | |
| } | |
| } | |
| }" 2>&1 || true | |
| fi | |
| done | |
| ) & | |
| GUARD_PID=$! | |
| echo "APIService guard started (PID=$GUARD_PID)" | |
| # Give guard time to do initial patch if needed | |
| sleep 12 | |
| echo "Checking external metrics API..." | |
| kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1" | head -1 && echo "External metrics API: OK" || echo "WARNING: External metrics API not available" | |
| TEST_EXIT=0 | |
| make test-benchmark || TEST_EXIT=$? | |
| kill $GUARD_PID 2>/dev/null || true | |
| exit $TEST_EXIT | |
| - name: Generate benchmark plots | |
| if: always() | |
| run: | | |
| echo "Installing matplotlib and numpy..." | |
| if python3 -m venv /tmp/plot-venv 2>&1; then | |
| /tmp/plot-venv/bin/pip install --quiet matplotlib numpy 2>&1 | |
| PYTHON=/tmp/plot-venv/bin/python3 | |
| else | |
| echo "venv failed, using PIP_BREAK_SYSTEM_PACKAGES fallback..." | |
| curl -sSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py | |
| PIP_BREAK_SYSTEM_PACKAGES=1 python3 /tmp/get-pip.py --user 2>&1 | |
| PIP_BREAK_SYSTEM_PACKAGES=1 python3 -m pip install --user matplotlib numpy 2>&1 | |
| PYTHON=python3 | |
| fi | |
| $PYTHON - <<'PLOTEOF' | |
| import json, os, sys | |
| try: | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| import matplotlib.ticker as ticker | |
| import numpy as np | |
| except ImportError: | |
| print("matplotlib not available, skipping plot generation") | |
| sys.exit(0) | |
| PANEL_DIR = '/tmp/benchmark-panels' | |
| PREFILL_FILE = '/tmp/prefill-benchmark-results.json' | |
| os.makedirs(PANEL_DIR, exist_ok=True) | |
| if not os.path.exists(PREFILL_FILE): | |
| print("No prefill results found, skipping plots") | |
| sys.exit(0) | |
| with open(PREFILL_FILE) as f: | |
| results = json.load(f) | |
| if not isinstance(results, list) or len(results) < 2: | |
| print("Need at least 2 results (HPA + WVA) for comparison plots") | |
| sys.exit(0) | |
| hpa = next((r for r in results if r['autoscaler_type'] == 'HPA'), None) | |
| wva = next((r for r in results if r['autoscaler_type'] == 'WVA'), None) | |
| if not hpa or not wva: | |
| print("Missing HPA or WVA results") | |
| sys.exit(0) | |
| plt.rcParams.update({ | |
| 'figure.facecolor': 'white', 'axes.facecolor': '#f8f9fa', | |
| 'axes.grid': True, 'grid.alpha': 0.3, 'font.size': 12, | |
| 'axes.titlesize': 14, 'axes.labelsize': 12, 'figure.dpi': 150, | |
| }) | |
| HPA_C, WVA_C = '#e74c3c', '#2ecc71' | |
| EMPTY_METRIC = {'mean': 0, 'count': 0, 'percentiles': {k: 0 for k in ['p05','p10','p25','p50','p75','p90','p95','p99']}} | |
| def m(data, key): | |
| """Safely get a metric dict, returning EMPTY_METRIC if absent.""" | |
| v = data.get(key) | |
| if isinstance(v, dict): | |
| if 'percentiles' not in v: | |
| v['percentiles'] = EMPTY_METRIC['percentiles'] | |
| return v | |
| return EMPTY_METRIC | |
| def bar_pair(ax, hv, wv, title, ylabel, fmt='.1f'): | |
| bars = ax.bar(['HPA', 'WVA'], [hv, wv], color=[HPA_C, WVA_C], width=0.5, edgecolor='white', linewidth=1.5) | |
| ax.set_title(title, fontweight='bold') | |
| ax.set_ylabel(ylabel) | |
| for bar, val in zip(bars, [hv, wv]): | |
| ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + ax.get_ylim()[1]*0.02, | |
| f'{val:{fmt}}', ha='center', va='bottom', fontweight='bold', fontsize=11) | |
| # 1. Comparison bars (3x3 grid) | |
| fig, axes = plt.subplots(3, 3, figsize=(20, 13)) | |
| fig.suptitle('HPA vs WVA — Prefill-Heavy Workload (OpenShift)', fontsize=16, fontweight='bold', y=1.02) | |
| bar_pair(axes[0,0], m(hpa,'throughput')['mean'], m(wva,'throughput')['mean'], 'Mean Throughput', 'tokens/sec', '.0f') | |
| bar_pair(axes[0,1], m(hpa,'ttft')['count'], m(wva,'ttft')['count'], 'Completed Requests', 'count', 'd') | |
| bar_pair(axes[0,2], hpa['max_replicas'], wva['max_replicas'], 'Max Replicas', 'replicas', 'd') | |
| bar_pair(axes[1,0], hpa['avg_kv_cache'], wva['avg_kv_cache'], 'Avg KV Cache', 'utilization', '.3f') | |
| bar_pair(axes[1,1], hpa['avg_queue_depth'], wva['avg_queue_depth'], 'Avg vLLM Queue', 'requests', '.0f') | |
| bar_pair(axes[1,2], hpa.get('avg_epp_queue_depth',0), wva.get('avg_epp_queue_depth',0), 'Avg EPP Queue', 'requests', '.0f') | |
| bar_pair(axes[2,0], m(hpa,'itl')['mean'], m(wva,'itl')['mean'], 'Mean ITL', 'ms', '.2f') | |
| bar_pair(axes[2,1], m(hpa,'ttft')['mean']/1000, m(wva,'ttft')['mean']/1000, 'Mean TTFT', 'seconds', '.1f') | |
| bar_pair(axes[2,2], hpa['avg_replicas'], wva['avg_replicas'], 'Avg Replicas', 'replicas', '.1f') | |
| fig.tight_layout() | |
| fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-comparison.png'), bbox_inches='tight', dpi=150) | |
| plt.close() | |
| # 2. Replica timeline overlay | |
| fig, ax = plt.subplots(figsize=(14, 5)) | |
| for data, label, color in [(hpa, 'HPA', HPA_C), (wva, 'WVA', WVA_C)]: | |
| tl = data['replica_timeline'] | |
| times = [s['elapsed_sec'] for s in tl] | |
| ready = [s['ready_replicas'] for s in tl] | |
| ax.step(times, ready, where='post', label=f'{label} (ready)', color=color, linewidth=2.5) | |
| ax.fill_between(times, ready, step='post', alpha=0.1, color=color) | |
| ax.set_title('Ready Replicas Over Time — HPA vs WVA', fontsize=14, fontweight='bold') | |
| ax.set_xlabel('Time (seconds)') | |
| ax.set_ylabel('Ready Replicas') | |
| ax.legend(fontsize=12) | |
| ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True)) | |
| ax.set_ylim(0, max(wva['max_replicas'], hpa['max_replicas']) + 1) | |
| fig.tight_layout() | |
| fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-replica-timeline.png'), bbox_inches='tight', dpi=150) | |
| plt.close() | |
| # 3. Queue depth (vLLM + EPP) + KV cache over time | |
| fig, (ax_qd, ax_epp, ax_kv) = plt.subplots(3, 1, figsize=(14, 11), sharex=True) | |
| fig.suptitle('Queue Depth & KV Cache Over Time — HPA vs WVA', fontsize=15, fontweight='bold') | |
| for data, label, color in [(hpa, 'HPA', HPA_C), (wva, 'WVA', WVA_C)]: | |
| mt = data.get('metrics_timeline', []) | |
| if mt: | |
| times = [s['elapsed_sec'] for s in mt] | |
| qd = [s['queue_depth'] for s in mt] | |
| epp_qd = [s.get('epp_queue_depth', 0) for s in mt] | |
| kv = [s['kv_cache'] for s in mt] | |
| ax_qd.plot(times, qd, label=label, color=color, linewidth=2, alpha=0.85) | |
| ax_qd.fill_between(times, qd, alpha=0.1, color=color) | |
| ax_epp.plot(times, epp_qd, label=label, color=color, linewidth=2, alpha=0.85) | |
| ax_epp.fill_between(times, epp_qd, alpha=0.1, color=color) | |
| ax_kv.plot(times, kv, label=label, color=color, linewidth=2, alpha=0.85) | |
| ax_kv.fill_between(times, kv, alpha=0.1, color=color) | |
| ax_qd.set_title('vLLM Queue Depth (vllm:num_requests_waiting)', fontweight='bold') | |
| ax_qd.set_ylabel('Waiting Requests') | |
| ax_qd.legend(fontsize=11) | |
| ax_epp.set_title('EPP Queue Depth (inference_extension_flow_control_queue_size)', fontweight='bold') | |
| ax_epp.set_ylabel('Queued Requests') | |
| ax_epp.legend(fontsize=11) | |
| ax_kv.set_title('KV Cache Utilization (vllm:kv_cache_usage_perc)', fontweight='bold') | |
| ax_kv.set_ylabel('Utilization') | |
| ax_kv.set_xlabel('Time (seconds)') | |
| ax_kv.legend(fontsize=11) | |
| fig.tight_layout() | |
| fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-metrics-timeline.png'), bbox_inches='tight', dpi=150) | |
| plt.close() | |
| # 4. Throughput percentile distribution | |
| pct_keys = ['p05', 'p10', 'p25', 'p50', 'p75', 'p90', 'p95', 'p99'] | |
| pct_labels = ['p5', 'p10', 'p25', 'p50', 'p75', 'p90', 'p95', 'p99'] | |
| x = np.arange(len(pct_labels)) | |
| w = 0.35 | |
| fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5)) | |
| fig.suptitle('Percentile Distributions — HPA vs WVA', fontsize=15, fontweight='bold') | |
| for ax, metric, unit, div in [(ax1,'ttft','seconds',1000),(ax2,'itl','ms',1),(ax3,'throughput','tok/s',1)]: | |
| hv = [m(hpa, metric)['percentiles'].get(k, 0)/div for k in pct_keys] | |
| wv = [m(wva, metric)['percentiles'].get(k, 0)/div for k in pct_keys] | |
| ax.bar(x - w/2, hv, w, label='HPA', color=HPA_C, alpha=0.85) | |
| ax.bar(x + w/2, wv, w, label='WVA', color=WVA_C, alpha=0.85) | |
| ax.set_xticks(x); ax.set_xticklabels(pct_labels, fontsize=9) | |
| ax.set_title(f'{metric.upper()} Percentiles', fontweight='bold') | |
| ax.set_ylabel(unit); ax.legend(fontsize=9) | |
| fig.tight_layout() | |
| fig.savefig(os.path.join(PANEL_DIR, 'panel-prefill-percentiles.png'), bbox_inches='tight', dpi=150) | |
| plt.close() | |
| print(f"Generated benchmark plots in {PANEL_DIR}") | |
| for f in sorted(os.listdir(PANEL_DIR)): | |
| if f.endswith('.png'): | |
| print(f" {f}") | |
| # --- Generate per-autoscaler PDF reports (colleague format) --- | |
| from matplotlib.backends.backend_pdf import PdfPages | |
| import textwrap | |
| model_id = os.environ.get('MODEL_ID', 'unknown') | |
| model_short = model_id.split('/')[-1] | |
| for data in [hpa, wva]: | |
| atype = data['autoscaler_type'] | |
| pdf_path = os.path.join(PANEL_DIR, f'report_{atype.lower()}_{model_short}.pdf') | |
| with PdfPages(pdf_path) as pdf: | |
| # Page 1: Configuration & Summary | |
| fig, ax = plt.subplots(figsize=(11, 8.5)) | |
| ax.axis('off') | |
| lines = [] | |
| sep = '='*80 | |
| dash = '-'*80 | |
| va_cfg = data.get('va_config', 'N/A') | |
| hpa_cfg = data.get('hpa_config', 'N/A') | |
| data_model = data.get('model_id', model_id) | |
| if atype == 'WVA': | |
| atype_label = 'Workload Variant Autoscaler (WVA)' | |
| else: | |
| atype_label = 'HPA Baseline (VA-constrained + HPA)' | |
| lines.append(sep) | |
| lines.append(f"AUTOSCALER TYPE : {atype_label}") | |
| lines.append(f"MODEL : {data_model}") | |
| lines.append(sep) | |
| lines.append(f"Autoscaler Configuration") | |
| lines.append(dash) | |
| lines.append(f" Variant (VA) : {va_cfg}") | |
| lines.append(f" HPA : {hpa_cfg}") | |
| lines.append(sep) | |
| lines.append(f"Benchmark Load Generator Configuration") | |
| lines.append(dash) | |
| lines.append(f" Profile : poisson @ 20 req/s") | |
| lines.append(f" Prompt tokens : 4000 | Output tokens: 1000") | |
| lines.append(f" Max seconds : 600 | Seed: 42") | |
| lines.append(sep) | |
| lines.append(f"EPP Configuration") | |
| lines.append(dash) | |
| lines.append(f" Flow Control : ENABLED") | |
| lines.append(f" Scorers : queue-scorer=2, kv-cache-utilization-scorer=2, prefix-cache-scorer=3") | |
| lines.append(sep) | |
| lines.append(f"Results Summary") | |
| lines.append(dash) | |
| tp_obj = data.get('throughput', {}) | |
| ttft_obj = data.get('ttft', {}) | |
| itl_obj = data.get('itl', {}) | |
| tp_mean = tp_obj.get('mean', 0) if isinstance(tp_obj, dict) else 0 | |
| ttft_mean = ttft_obj.get('mean', 0) if isinstance(ttft_obj, dict) else 0 | |
| ttft_p50 = ttft_obj.get('percentiles', {}).get('p50', 0) if isinstance(ttft_obj, dict) else 0 | |
| ttft_p99 = ttft_obj.get('percentiles', {}).get('p99', 0) if isinstance(ttft_obj, dict) else 0 | |
| itl_mean = itl_obj.get('mean', 0) if isinstance(itl_obj, dict) else 0 | |
| itl_p50 = itl_obj.get('percentiles', {}).get('p50', 0) if isinstance(itl_obj, dict) else 0 | |
| itl_p99 = itl_obj.get('percentiles', {}).get('p99', 0) if isinstance(itl_obj, dict) else 0 | |
| completed = ttft_obj.get('count', 0) if isinstance(ttft_obj, dict) else 0 | |
| error_count = data.get('error_count', 0) | |
| incomplete_count = data.get('incomplete_count', 0) | |
| achieved_rps = data.get('achieved_rps', 0) | |
| lines.append(f" Completed Requests : {completed}") | |
| lines.append(f" Failed Requests : {error_count}") | |
| lines.append(f" Incomplete Requests : {incomplete_count}") | |
| lines.append(f" Achieved RPS : {achieved_rps:.2f}") | |
| lines.append(f" Throughput (mean) : {tp_mean:.1f} tok/s") | |
| lines.append(f" Max Replicas : {data['max_replicas']}") | |
| lines.append(f" Avg Replicas : {data['avg_replicas']:.2f}") | |
| lines.append(f" Avg vLLM Queue : {data['avg_queue_depth']:.1f}") | |
| lines.append(f" Avg EPP Queue : {data.get('avg_epp_queue_depth', 0):.1f}") | |
| lines.append(f" Avg KV Cache : {data['avg_kv_cache']*100:.2f}%") | |
| lines.append(dash) | |
| lines.append(f" TTFT mean={ttft_mean/1000:.2f}s p50={ttft_p50/1000:.2f}s p99={ttft_p99/1000:.2f}s") | |
| lines.append(f" ITL mean={itl_mean:.2f}ms p50={itl_p50:.2f}ms p99={itl_p99:.2f}ms") | |
| lines.append(f" Duration: {data['duration_sec']:.0f}s") | |
| lines.append(sep) | |
| ax.text(0.05, 0.95, '\n'.join(lines), transform=ax.transAxes, fontsize=8.5, | |
| verticalalignment='top', fontfamily='monospace', | |
| bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8)) | |
| fig.suptitle(f'{atype} Benchmark Report — {model_short}', fontsize=14, fontweight='bold') | |
| pdf.savefig(fig, bbox_inches='tight') | |
| plt.close() | |
| # Page 2: Time-series charts (KV Cache, Queue, Replicas, EPP Queue) | |
| mt = data.get('metrics_timeline', []) | |
| tl = data.get('replica_timeline', []) | |
| if mt and tl: | |
| fig, axes = plt.subplots(4, 1, figsize=(11, 14), sharex=True) | |
| fig.suptitle(f'{atype} — Metrics Over Time ({model_short})', fontsize=14, fontweight='bold') | |
| color = WVA_C if atype == 'WVA' else HPA_C | |
| times_m = [s['elapsed_sec'] for s in mt] | |
| kv = [s['kv_cache']*100 for s in mt] | |
| qd = [s['queue_depth'] for s in mt] | |
| epp = [s.get('epp_queue_depth', 0) for s in mt] | |
| times_r = [s['elapsed_sec'] for s in tl] | |
| ready = [s['ready_replicas'] for s in tl] | |
| axes[0].plot(times_m, kv, color=color, linewidth=2) | |
| axes[0].fill_between(times_m, kv, alpha=0.15, color=color) | |
| axes[0].set_ylabel('KV Cache Usage (%)') | |
| axes[0].set_title('KV Cache Usage Over Time') | |
| axes[1].plot(times_m, qd, color=color, linewidth=2) | |
| axes[1].fill_between(times_m, qd, alpha=0.15, color=color) | |
| axes[1].set_ylabel('Requests Waiting') | |
| axes[1].set_title('Number of Requests Waiting Over Time') | |
| axes[2].step(times_r, ready, where='post', color=color, linewidth=2.5, label='Actual Replicas') | |
| axes[2].fill_between(times_r, ready, step='post', alpha=0.1, color=color) | |
| ax2b = axes[2].twinx() | |
| ax2b.plot(times_m, epp, color='#3498db', linewidth=1.5, alpha=0.7, label='EPP Queue') | |
| ax2b.set_ylabel('EPP Queue Size', color='#3498db') | |
| axes[2].set_ylabel('Replica Count') | |
| axes[2].set_title('Replica Count & EPP Queue Over Time') | |
| axes[2].yaxis.set_major_locator(ticker.MaxNLocator(integer=True)) | |
| axes[2].legend(loc='upper left', fontsize=9) | |
| ax2b.legend(loc='upper right', fontsize=9) | |
| axes[3].plot(times_m, epp, color='#3498db', linewidth=2) | |
| axes[3].fill_between(times_m, epp, alpha=0.15, color='#3498db') | |
| axes[3].set_ylabel('EPP Queue Size') | |
| axes[3].set_xlabel('Time (seconds)') | |
| axes[3].set_title('EPP Flow Control Queue Size Over Time') | |
| fig.tight_layout() | |
| pdf.savefig(fig, bbox_inches='tight') | |
| plt.close() | |
| # Page 3: GuideLLM metrics (TTFT, ITL, Throughput distributions) | |
| has_pcts = isinstance(ttft_obj, dict) and 'percentiles' in ttft_obj | |
| if has_pcts: | |
| fig, axes = plt.subplots(2, 2, figsize=(11, 8.5)) | |
| fig.suptitle(f'{atype} — GuideLLM Latency & Throughput ({model_short})', fontsize=14, fontweight='bold') | |
| pct_keys = ['p05','p10','p25','p50','p75','p90','p95','p99'] | |
| pct_labels = ['p5','p10','p25','p50','p75','p90','p95','p99'] | |
| x = np.arange(len(pct_labels)) | |
| ttft_vals = [ttft_obj.get('percentiles', {}).get(k, 0)/1000 for k in pct_keys] | |
| axes[0,0].bar(x, ttft_vals, color=color, alpha=0.85) | |
| axes[0,0].set_xticks(x); axes[0,0].set_xticklabels(pct_labels, fontsize=8) | |
| axes[0,0].set_title('TTFT Percentiles', fontweight='bold') | |
| axes[0,0].set_ylabel('seconds') | |
| axes[0,0].axhline(y=ttft_mean/1000, color='red', linestyle='--', label=f'mean={ttft_mean/1000:.1f}s') | |
| axes[0,0].legend(fontsize=8) | |
| itl_vals = [itl_obj.get('percentiles', {}).get(k, 0) for k in pct_keys] | |
| axes[0,1].bar(x, itl_vals, color=color, alpha=0.85) | |
| axes[0,1].set_xticks(x); axes[0,1].set_xticklabels(pct_labels, fontsize=8) | |
| axes[0,1].set_title('ITL Percentiles', fontweight='bold') | |
| axes[0,1].set_ylabel('ms') | |
| axes[0,1].axhline(y=itl_mean, color='red', linestyle='--', label=f'mean={itl_mean:.2f}ms') | |
| axes[0,1].legend(fontsize=8) | |
| tp_vals = [tp_obj.get('percentiles', {}).get(k, 0) for k in pct_keys] | |
| axes[1,0].bar(x, tp_vals, color=color, alpha=0.85) | |
| axes[1,0].set_xticks(x); axes[1,0].set_xticklabels(pct_labels, fontsize=8) | |
| axes[1,0].set_title('Throughput Percentiles', fontweight='bold') | |
| axes[1,0].set_ylabel('tok/s') | |
| axes[1,0].axhline(y=tp_mean, color='red', linestyle='--', label=f'mean={tp_mean:.0f}') | |
| axes[1,0].legend(fontsize=8) | |
| axes[1,1].axis('off') | |
| summary_lines = [ | |
| f"Completed : {completed}", | |
| f"Failed : {error_count}", | |
| f"Incomplete: {incomplete_count}", | |
| f"RPS : {achieved_rps:.2f}", | |
| f"", | |
| f"Throughput: {tp_mean:.0f} tok/s", | |
| f"TTFT mean : {ttft_mean/1000:.2f}s", | |
| f"ITL mean : {itl_mean:.2f}ms", | |
| f"", | |
| f"Avg Replicas: {data['avg_replicas']:.2f}", | |
| f"Max Replicas: {data['max_replicas']}", | |
| f"Avg KV Cache: {data['avg_kv_cache']*100:.2f}%", | |
| ] | |
| axes[1,1].text(0.1, 0.85, '\n'.join(summary_lines), transform=axes[1,1].transAxes, | |
| fontsize=11, verticalalignment='top', fontfamily='monospace', | |
| bbox=dict(boxstyle='round', facecolor='#f0f0f0', alpha=0.8)) | |
| axes[1,1].set_title('Summary', fontweight='bold') | |
| fig.tight_layout() | |
| pdf.savefig(fig, bbox_inches='tight') | |
| plt.close() | |
| print(f" Generated PDF report: {pdf_path}") | |
| print(f"Generated benchmark plots in {PANEL_DIR}") | |
| for f in sorted(os.listdir(PANEL_DIR)): | |
| if f.endswith('.png') or f.endswith('.pdf'): | |
| print(f" {f}") | |
| PLOTEOF | |
| - name: Upload benchmark results | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: benchmark-results-openshift | |
| path: | | |
| /tmp/benchmark-results.json | |
| /tmp/prefill-benchmark-results.json | |
| /tmp/benchmark-grafana-snapshot.txt | |
| /tmp/benchmark-grafana-snapshot.json | |
| /tmp/benchmark-panels/ | |
| if-no-files-found: warn | |
| - name: Post benchmark results as PR comment | |
| if: always() && (github.event_name == 'issue_comment' || needs.gate.outputs.pr_number != '') | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
// Gather identifiers for the PR comment and resolve a direct download link to
// the uploaded benchmark artifact (falls back to the workflow-run page when
// the artifact cannot be found).
const fs = require('fs');
const path = require('path');
// Always pass a radix: avoids legacy octal/implementation-defined parsing.
const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}', 10);
const sha = '${{ needs.gate.outputs.pr_head_sha }}';
const runId = context.runId;
const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;
let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
try {
  const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
    owner: context.repo.owner,
    repo: context.repo.repo,
    run_id: runId
  });
  const benchArtifact = artifacts.find(a => a.name === 'benchmark-results-openshift');
  if (benchArtifact) {
    // Deep-link straight to the artifact instead of the run page.
    artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
  }
} catch (e) {
  // Best-effort lookup: keep the run URL as the download link on failure.
  console.log(`Could not look up artifact: ${e.message}`);
}
// Render the scale-up benchmark summary as a markdown table. Any failure to
// read or parse the results file leaves the warning placeholder in place.
let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.';
try {
  const data = JSON.parse(fs.readFileSync('/tmp/benchmark-results.json', 'utf8'));
  // Negative durations are sentinels for "event never observed".
  const fmtTime = (v) => v < 0 ? 'N/A' : `${v.toFixed(1)}s`;
  const rows = [
    '| Metric | Value |',
    '|--------|-------|',
    `| Scale-up time | ${fmtTime(data.scaleUpTimeSec)} |`,
    `| Scale-down time | ${fmtTime(data.scaleDownTimeSec)} |`,
    `| Max replicas | ${data.maxReplicas} |`,
    `| Avg KV cache usage | ${data.avgKVCacheUsage.toFixed(3)} |`,
    `| Avg queue depth | ${data.avgQueueDepth.toFixed(1)} |`,
    `| Replica oscillation (σ) | ${data.replicaOscillation.toFixed(2)} |`,
    `| Total duration | ${data.totalDurationSec.toFixed(0)}s |`
  ];
  resultsTable = rows.join('\n');
} catch (e) {
  console.log(`Could not read results: ${e.message}`);
}
// Build the optional "prefill-heavy-workload" section of the PR comment from
// /tmp/prefill-benchmark-results.json. Stays '' (section omitted) when the
// file is missing, unparsable, or empty.
let prefillSection = '';
try {
  const prefillData = JSON.parse(fs.readFileSync('/tmp/prefill-benchmark-results.json', 'utf8'));
  // Two or more entries: render a side-by-side HPA (baseline) vs WVA table.
  if (Array.isArray(prefillData) && prefillData.length >= 2) {
    const hpa = prefillData.find(r => r.autoscaler_type === 'HPA');
    const wva = prefillData.find(r => r.autoscaler_type === 'WVA');
    if (hpa && wva) {
      // Percent change of WVA relative to the HPA baseline, with a direction
      // arrow; '—' when the baseline is zero (delta undefined).
      // NOTE(review): `lowerBetter` is unused — every call site passes 2 args.
      const delta = (h, w, lowerBetter) => {
        if (h === 0) return '—';
        const pct = ((w - h) / Math.abs(h)) * 100;
        const arrow = pct < 0 ? '↓' : '↑';
        const sign = pct > 0 ? '+' : '';
        return `${sign}${pct.toFixed(1)}% ${arrow}`;
      };
      // fmtP: one percentile (key e.g. 'p50') of a stats object; fmtM: its
      // mean. `div` rescales units (1000 = ms -> s); 'N/A' when stats absent.
      const fmtP = (obj, key, div=1) => obj && obj.percentiles ? (obj.percentiles[key]/div).toFixed(1) : 'N/A';
      const fmtM = (obj, div=1, prec=1) => obj ? (obj.mean/div).toFixed(prec) : 'N/A';
      // Continuation lines stay flush-left so no indentation leaks into the
      // rendered markdown table.
      let table = `| Metric | HPA (Baseline) | WVA | Δ |
|--------|---------------|-----|---|
| **Max Replicas** | ${hpa.max_replicas} | **${wva.max_replicas}** | ${delta(hpa.max_replicas, wva.max_replicas)} |
| **Avg Replicas** | ${hpa.avg_replicas.toFixed(2)} | **${wva.avg_replicas.toFixed(2)}** | ${delta(hpa.avg_replicas, wva.avg_replicas)} |
| **Avg vLLM Queue Depth** | ${hpa.avg_queue_depth.toFixed(1)} | **${wva.avg_queue_depth.toFixed(1)}** | ${delta(hpa.avg_queue_depth, wva.avg_queue_depth)} |
| **Avg EPP Queue Depth** | ${(hpa.avg_epp_queue_depth||0).toFixed(1)} | **${(wva.avg_epp_queue_depth||0).toFixed(1)}** | ${delta(hpa.avg_epp_queue_depth||0, wva.avg_epp_queue_depth||0)} |
| **Avg KV Cache** | ${hpa.avg_kv_cache.toFixed(3)} | ${wva.avg_kv_cache.toFixed(3)} | ${delta(hpa.avg_kv_cache, wva.avg_kv_cache)} |
| **TTFT mean** | ${fmtM(hpa.ttft, 1000)}s | **${fmtM(wva.ttft, 1000)}s** | ${hpa.ttft && wva.ttft ? delta(hpa.ttft.mean, wva.ttft.mean) : '—'} |
| **TTFT p50** | ${fmtP(hpa.ttft, 'p50', 1000)}s | **${fmtP(wva.ttft, 'p50', 1000)}s** | — |
| **TTFT p99** | ${fmtP(hpa.ttft, 'p99', 1000)}s | **${fmtP(wva.ttft, 'p99', 1000)}s** | — |
| **ITL mean** | ${fmtM(hpa.itl, 1, 2)}ms | **${fmtM(wva.itl, 1, 2)}ms** | ${hpa.itl && wva.itl ? delta(hpa.itl.mean, wva.itl.mean) : '—'} |
| **Throughput mean** | ${fmtM(hpa.throughput)}tok/s | **${fmtM(wva.throughput)}tok/s** | ${hpa.throughput && wva.throughput ? delta(hpa.throughput.mean, wva.throughput.mean) : '—'} |
| **Throughput p50** | ${fmtP(hpa.throughput, 'p50')}tok/s | **${fmtP(wva.throughput, 'p50')}tok/s** | — |
| **Completed Requests** | ${hpa.ttft ? hpa.ttft.count : 'N/A'} | **${wva.ttft ? wva.ttft.count : 'N/A'}** | ${hpa.ttft && wva.ttft ? delta(hpa.ttft.count, wva.ttft.count) : '—'} |
| **Failed Requests** | ${hpa.error_count || 0} | ${wva.error_count || 0} | — |
| **Incomplete Requests** | ${hpa.incomplete_count || 0} | ${wva.incomplete_count || 0} | — |
| **Achieved RPS** | ${(hpa.achieved_rps || 0).toFixed(2)} | ${(wva.achieved_rps || 0).toFixed(2)} | — |
| **Duration** | ${hpa.duration_sec.toFixed(0)}s | ${wva.duration_sec.toFixed(0)}s | — |`;
      // Collapsible per-autoscaler replica timelines (spec vs ready counts).
      let timelines = '';
      for (const r of [hpa, wva]) {
        if (r.replica_timeline && r.replica_timeline.length > 0) {
          timelines += `\n<details>\n<summary>${r.autoscaler_type} Replica Timeline (${r.replica_timeline.length} snapshots)</summary>\n\n| Time (s) | Spec | Ready |\n|----------|------|-------|\n`;
          for (const s of r.replica_timeline) {
            timelines += `| ${s.elapsed_sec.toFixed(0)} | ${s.spec_replicas} | ${s.ready_replicas} |\n`;
          }
          timelines += `\n</details>\n`;
        }
      }
      prefillSection = `\n\n---\n\n## Benchmark: prefill-heavy-workload (OpenShift)\n\n${table}\n${timelines}`;
    }
  } else if (Array.isArray(prefillData) && prefillData.length > 0) {
    // Degraded mode: only one run present — emit a per-run summary table
    // instead of the HPA-vs-WVA comparison.
    let rows = '';
    for (const r of prefillData) {
      rows += `\n### ${r.autoscaler_type}\n\n| Metric | Value |\n|--------|-------|\n| Duration | ${r.duration_sec.toFixed(0)}s |\n| Max Replicas | ${r.max_replicas} |\n| Avg Replicas | ${r.avg_replicas.toFixed(2)} |\n| Avg vLLM Queue Depth | ${r.avg_queue_depth.toFixed(2)} |\n| Avg EPP Queue Depth | ${(r.avg_epp_queue_depth||0).toFixed(2)} |\n| Avg KV Cache | ${r.avg_kv_cache.toFixed(3)} |\n`;
    }
    prefillSection = `\n\n---\n\n## Benchmark: prefill-heavy-workload (OpenShift)\n${rows}`;
  }
} catch (e) {
  // Best-effort: missing prefill results must not block the main comment.
  console.log(`Could not read prefill results: ${e.message}`);
}
// Upload panel PNGs as assets of a prerelease so they can be embedded inline
// in the PR comment (comments cannot reference workflow-artifact files).
let panelImages = '';
const panelDir = '/tmp/benchmark-panels';
// `hasPanels` is also consumed below when composing the artifacts section.
const hasPanels = fs.existsSync(panelDir) && fs.readdirSync(panelDir).some(f => f.endsWith('.png'));
if (hasPanels) {
  const pngs = fs.readdirSync(panelDir).filter(f => f.endsWith('.png')).sort();
  const tag = `benchmark-run-os-${runId}`;
  try {
    const release = await github.rest.repos.createRelease({
      owner: context.repo.owner,
      repo: context.repo.repo,
      tag_name: tag,
      name: `Benchmark panels OpenShift (PR #${prNumber}, ${sha.substring(0, 7)})`,
      body: `Auto-generated by benchmark CI run #${runId}`,
      draft: false,
      prerelease: true
    });
    const imageUrls = [];
    for (const png of pngs) {
      const filePath = path.join(panelDir, png);
      const fileData = fs.readFileSync(filePath);
      const asset = await github.rest.repos.uploadReleaseAsset({
        owner: context.repo.owner,
        repo: context.repo.repo,
        release_id: release.data.id,
        name: png,
        data: fileData,
        headers: { 'content-type': 'image/png' }
      });
      const title = png.replace('panel-', '').replace('.png', '').replace(/-/g, ' ');
      // Bug fix: the uploaded asset's URL was previously discarded, so the
      // comment listed bare headings with no images. Embed the image markdown
      // pointing at the asset's public download URL.
      imageUrls.push(`#### ${title}\n![${title}](${asset.data.browser_download_url})`);
    }
    if (imageUrls.length > 0) {
      panelImages = `\n\n<details>\n<summary>Dashboard Panels (${imageUrls.length})</summary>\n\n${imageUrls.join('\n\n')}\n\n</details>`;
    }
  } catch (e) {
    // Panel embedding is optional; the comment is still posted without it.
    console.log(`Could not upload panel images: ${e.message}`);
  }
}
// Compose and post the final PR comment.
const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
let artifactsSection = '';
if (hasSnapshotJson || hasPanels) {
  const items = [];
  if (hasSnapshotJson) items.push('Grafana snapshot JSON');
  artifactsSection = `\n\n📎 **[Download artifacts](${artifactUrl})**${items.length ? ' — ' + items.join(', ') : ''}`;
}
// Rendering fix: GitHub only parses markdown inside an HTML <details> block
// when it is separated by blank lines, so blank lines are required after
// </summary> (and around the block) for the bullet list to render as a list.
const body = `## Benchmark: scale-up-latency (OpenShift)

${resultsTable}${prefillSection}${panelImages}${artifactsSection}

<details>
<summary>Environment</summary>

- Cluster: OpenShift (Real GPUs)
- Model: ${process.env.MODEL_ID || 'unsloth/Meta-Llama-3.1-8B'}
- Accelerator: H100
- Commit: ${sha.substring(0, 7)}
- Scaler: prometheus-adapter
- [Workflow run](${repoUrl}/actions/runs/${runId})

</details>`;
await github.rest.issues.createComment({
  owner: context.repo.owner,
  repo: context.repo.repo,
  issue_number: prNumber,
  body: body
});
# Best-effort teardown: every command is suffixed with `|| true` so that
# cleanup always runs to completion even if a resource was never created.
- name: Cleanup infrastructure
  if: always()
  run: |
    # Remove the WVA helm release, then every release in the llm-d namespace.
    helm uninstall "$WVA_RELEASE_NAME" -n "$WVA_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    for release in $(helm list -n "$LLMD_NAMESPACE" -q 2>/dev/null); do
      helm uninstall "$release" -n "$LLMD_NAMESPACE" --ignore-not-found --wait --timeout 60s || true
    done
    # Finally drop both namespaces (sweeps any leftover namespaced resources).
    kubectl delete namespace "$LLMD_NAMESPACE" --ignore-not-found --timeout=120s || true
    kubectl delete namespace "$WVA_NAMESPACE" --ignore-not-found --timeout=120s || true
| report-status: | |
| runs-on: ubuntu-latest | |
| needs: [gate, benchmark-kind, benchmark-openshift] | |
| if: always() && needs.gate.outputs.run_benchmark == 'true' | |
| permissions: | |
| statuses: write | |
| steps: | |
| - name: Report status to PR | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| const prHeadSha = '${{ needs.gate.outputs.pr_head_sha }}'; | |
| const platform = '${{ needs.gate.outputs.platform }}'; | |
| let benchResult; | |
| if (platform === 'openshift') { | |
| benchResult = '${{ needs.benchmark-openshift.result }}'; | |
| } else { | |
| benchResult = '${{ needs.benchmark-kind.result }}'; | |
| } | |
| if (!prHeadSha) { | |
| console.log('No PR head SHA available, skipping status report'); | |
| return; | |
| } | |
| let state, description; | |
| if (benchResult === 'success') { | |
| state = 'success'; | |
| description = 'Benchmark completed successfully'; | |
| } else if (benchResult === 'skipped') { | |
| state = 'failure'; | |
| description = 'Benchmark did not run (prerequisite failed or skipped)'; | |
| } else if (benchResult === 'cancelled') { | |
| state = 'failure'; | |
| description = 'Benchmark cancelled'; | |
| } else { | |
| state = 'failure'; | |
| description = 'Benchmark failed'; | |
| } | |
| console.log(`Reporting status to PR commit ${prHeadSha}: ${state} - ${description}`); | |
| await github.rest.repos.createCommitStatus({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| sha: prHeadSha, | |
| state: state, | |
| target_url: `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`, | |
| description: description, | |
| context: '${{ github.workflow }} / benchmark-kind' | |
| }); | |
| console.log('Status reported successfully'); |