Skip to content

Commit 612f08f

Browse files
committed
Feat: Export Grafana snapshot JSON and render panels to PNG
Persist benchmark Grafana data to GitHub Actions artifacts so results survive the ephemeral Kind cluster:

1. Snapshot JSON export: fetch the full snapshot via GET /api/snapshots/:key and save it as re-importable JSON (POST it to any Grafana instance to restore).
2. Panel PNG rendering: enable the grafana-image-renderer plugin and render all 5 dashboard panels to individual PNG files.

- grafana.go: CreateSnapshot now returns SnapshotResult (key + URL); add ExportSnapshotJSON and RenderAllPanels methods
- config.go: add GrafanaSnapshotJSONFile and GrafanaPanelDir fields
- benchmark-dashboard.json: add explicit panel IDs for stable rendering
- benchmark-grafana-values.yaml: enable imageRenderer with resource limits
- ci-benchmark.yaml: pass new env vars, upload JSON + PNGs as artifacts
1 parent 2c1b6ed commit 612f08f

10 files changed

Lines changed: 287 additions & 66 deletions

File tree

.github/workflows/ci-benchmark.yaml

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ jobs:
117117
permissions:
118118
contents: read
119119
statuses: write
120+
pull-requests: write
121+
actions: read
120122
steps:
121123
- name: Set pending status on PR head
122124
uses: actions/github-script@v7
@@ -193,6 +195,7 @@ jobs:
193195
USE_SIMULATOR: "true"
194196
CREATE_CLUSTER: "true"
195197
INSTALL_GATEWAY_CTRLPLANE: "true"
198+
E2E_TESTS_ENABLED: "true"
196199
IMG: ${{ steps.build-image.outputs.image }}
197200
SKIP_BUILD: "true"
198201
KV_SPARE_TRIGGER: "0.5"
@@ -207,6 +210,8 @@ jobs:
207210
BENCHMARK_RESULTS_FILE: /tmp/benchmark-results.json
208211
BENCHMARK_GRAFANA_ENABLED: "true"
209212
BENCHMARK_GRAFANA_SNAPSHOT_FILE: /tmp/benchmark-grafana-snapshot.txt
213+
BENCHMARK_GRAFANA_SNAPSHOT_JSON: /tmp/benchmark-grafana-snapshot.json
214+
BENCHMARK_GRAFANA_PANEL_DIR: /tmp/benchmark-panels
210215
KV_SPARE_TRIGGER: "0.5"
211216
QUEUE_SPARE_TRIGGER: "4.5"
212217
run: make test-benchmark
@@ -219,6 +224,8 @@ jobs:
219224
path: |
220225
/tmp/benchmark-results.json
221226
/tmp/benchmark-grafana-snapshot.txt
227+
/tmp/benchmark-grafana-snapshot.json
228+
/tmp/benchmark-panels/
222229
if-no-files-found: warn
223230

224231
- name: Post benchmark results as PR comment
@@ -229,6 +236,24 @@ jobs:
229236
const fs = require('fs');
230237
const prNumber = parseInt('${{ needs.gate.outputs.pr_number }}');
231238
const sha = '${{ needs.gate.outputs.pr_head_sha }}';
239+
const runId = context.runId;
240+
const repoUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}`;
241+
242+
// Look up the uploaded artifact to get a direct download link
243+
let artifactUrl = `${repoUrl}/actions/runs/${runId}`;
244+
try {
245+
const { data: { artifacts } } = await github.rest.actions.listWorkflowRunArtifacts({
246+
owner: context.repo.owner,
247+
repo: context.repo.repo,
248+
run_id: runId
249+
});
250+
const benchArtifact = artifacts.find(a => a.name === 'benchmark-results');
251+
if (benchArtifact) {
252+
artifactUrl = `${repoUrl}/actions/runs/${runId}/artifacts/${benchArtifact.id}`;
253+
}
254+
} catch (e) {
255+
console.log(`Could not look up artifact: ${e.message}`);
256+
}
232257
233258
let resultsTable = '⚠️ Benchmark results file not found or could not be parsed.';
234259
@@ -246,17 +271,31 @@ jobs:
246271
| Avg queue depth | ${data.avgQueueDepth.toFixed(1)} |
247272
| Replica oscillation (σ) | ${data.replicaOscillation.toFixed(2)} |
248273
| Total duration | ${data.totalDurationSec.toFixed(0)}s |`;
249-
250-
if (data.grafanaSnapshotUrl) {
251-
resultsTable += `\n| Grafana snapshot | [View dashboard](${data.grafanaSnapshotUrl}) |`;
252-
}
253274
} catch (e) {
254275
console.log(`Could not read results: ${e.message}`);
255276
}
256277
278+
// Check which Grafana artifacts exist
279+
const hasSnapshotJson = fs.existsSync('/tmp/benchmark-grafana-snapshot.json');
280+
const hasPanels = fs.existsSync('/tmp/benchmark-panels') &&
281+
fs.readdirSync('/tmp/benchmark-panels').some(f => f.endsWith('.png'));
282+
283+
let artifactsSection = '';
284+
if (hasSnapshotJson || hasPanels) {
285+
const items = [];
286+
if (hasSnapshotJson) {
287+
items.push('Grafana snapshot JSON (re-import via `POST /api/snapshots`)');
288+
}
289+
if (hasPanels) {
290+
const pngs = fs.readdirSync('/tmp/benchmark-panels').filter(f => f.endsWith('.png'));
291+
items.push(`${pngs.length} dashboard panel PNGs`);
292+
}
293+
artifactsSection = `\n\n📎 **[Download artifacts](${artifactUrl})** — ${items.join(', ')}`;
294+
}
295+
257296
const body = `## Benchmark: scale-up-latency (Kind)
258297
259-
${resultsTable}
298+
${resultsTable}${artifactsSection}
260299
261300
<details>
262301
<summary>Environment</summary>
@@ -265,6 +304,7 @@ jobs:
265304
- Model: unsloth/Meta-Llama-3.1-8B (simulator)
266305
- Commit: ${sha.substring(0, 7)}
267306
- Scaler: prometheus-adapter
307+
- [Workflow run](${repoUrl}/actions/runs/${runId})
268308
269309
</details>`;
270310

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ test-benchmark: manifests generate fmt vet ## Run benchmark tests (scale-up-late
283283
USE_SIMULATOR=$(USE_SIMULATOR) \
284284
SCALER_BACKEND=$(SCALER_BACKEND) \
285285
MODEL_ID=$(MODEL_ID) \
286-
go test ./test/benchmark/ -timeout 25m -v -ginkgo.v \
286+
go test ./test/benchmark/ -timeout 30m -v -ginkgo.v \
287287
-ginkgo.label-filter="benchmark"; \
288288
TEST_EXIT_CODE=$$?; \
289289
echo ""; \

deploy/grafana/benchmark-dashboard.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"links": [],
2020
"panels": [
2121
{
22+
"id": 1,
2223
"title": "Deployment Replicas",
2324
"type": "timeseries",
2425
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
@@ -54,6 +55,7 @@
5455
"options": { "legend": { "displayMode": "list", "placement": "bottom" } }
5556
},
5657
{
58+
"id": 2,
5759
"title": "WVA Desired Replicas",
5860
"type": "timeseries",
5961
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
@@ -82,6 +84,7 @@
8284
"options": { "legend": { "displayMode": "list", "placement": "bottom" } }
8385
},
8486
{
87+
"id": 3,
8588
"title": "KV Cache Usage",
8689
"type": "timeseries",
8790
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
@@ -124,6 +127,7 @@
124127
"options": { "legend": { "displayMode": "list", "placement": "bottom" } }
125128
},
126129
{
130+
"id": 4,
127131
"title": "Queue Depth (Requests Waiting)",
128132
"type": "timeseries",
129133
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
@@ -157,6 +161,7 @@
157161
"options": { "legend": { "displayMode": "list", "placement": "bottom" } }
158162
},
159163
{
164+
"id": 5,
160165
"title": "Saturation Metrics",
161166
"type": "timeseries",
162167
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },

deploy/grafana/benchmark-grafana-values.yaml

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@ grafana:
55
enabled: true
66
adminUser: admin
77
adminPassword: admin
8-
service:
9-
type: ClusterIP
10-
port: 3000
118
# Allow anonymous access so the benchmark can use the API without auth tokens
129
grafana.ini:
1310
auth.anonymous:
@@ -17,6 +14,19 @@ grafana:
1714
allow_embedding: true
1815
snapshots:
1916
external_enabled: false
17+
rendering:
18+
server_url: http://kube-prometheus-stack-grafana-image-renderer:8081/render
19+
callback_url: http://kube-prometheus-stack-grafana:80/
20+
# Image renderer for PNG panel exports (separate Deployment — best-effort)
21+
imageRenderer:
22+
enabled: true
23+
resources:
24+
requests:
25+
cpu: 50m
26+
memory: 100Mi
27+
limits:
28+
cpu: 200m
29+
memory: 256Mi
2030
# Auto-provision Prometheus as a datasource
2131
additionalDataSources:
2232
- name: Prometheus
@@ -37,7 +47,7 @@ grafana:
3747
resources:
3848
requests:
3949
cpu: 100m
40-
memory: 128Mi
41-
limits:
42-
cpu: 300m
4350
memory: 256Mi
51+
limits:
52+
cpu: 500m
53+
memory: 512Mi

test/benchmark/benchmark_test.go

Lines changed: 49 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -129,32 +129,72 @@ var _ = Describe("Scale-Up Latency Benchmark", Label("benchmark"), Ordered, func
129129
Name: vaName,
130130
}, currentVA)
131131
g.Expect(err).NotTo(HaveOccurred())
132-
optimized := int32(currentVA.Status.DesiredOptimizedAlloc.NumReplicas)
133-
g.Expect(optimized).To(BeNumerically(">=", 1), "VA should have optimized >= 1")
132+
g.Expect(currentVA.Status.DesiredOptimizedAlloc.NumReplicas).NotTo(BeNil(), "NumReplicas should be set")
133+
g.Expect(*currentVA.Status.DesiredOptimizedAlloc.NumReplicas).To(BeNumerically(">=", 1), "VA should have optimized >= 1")
134134
}, 5*time.Minute, 10*time.Second).Should(Succeed())
135135

136+
By("Verifying external metrics API serves wva_desired_replicas")
137+
Eventually(func(g Gomega) {
138+
result, err := k8sClient.RESTClient().
139+
Get().
140+
AbsPath("/apis/external.metrics.k8s.io/v1beta1/namespaces/" + benchCfg.LLMDNamespace + "/wva_desired_replicas").
141+
DoRaw(ctx)
142+
g.Expect(err).NotTo(HaveOccurred(), "External metrics API should be accessible")
143+
g.Expect(string(result)).To(ContainSubstring("wva_desired_replicas"), "Metric should be available")
144+
g.Expect(string(result)).To(ContainSubstring(vaName), "Metric should reference the benchmark VA")
145+
GinkgoWriter.Printf("External metrics API confirmed: wva_desired_replicas available for %s\n", vaName)
146+
}, 5*time.Minute, 10*time.Second).Should(Succeed())
147+
148+
By("Waiting for Prometheus to scrape simulator metrics")
149+
Eventually(func(g Gomega) {
150+
_, err := promClient.QueryWithRetry(ctx, `vllm:gpu_cache_usage_perc`)
151+
g.Expect(err).NotTo(HaveOccurred(), "Prometheus should have KV cache metrics from simulator")
152+
GinkgoWriter.Println("Prometheus confirmed: vllm:gpu_cache_usage_perc is available")
153+
}, 5*time.Minute, 15*time.Second).Should(Succeed())
154+
136155
scenarioStart = time.Now()
137-
GinkgoWriter.Println("BeforeAll completed — benchmark scenario starting")
156+
GinkgoWriter.Println("BeforeAll completed — metrics pipeline verified, benchmark scenario starting")
138157
})
139158

140159
AfterAll(func() {
141160
results.TotalDurationSec = time.Since(scenarioStart).Seconds()
142161

143162
if grafanaClient != nil && benchCfg.GrafanaEnabled {
144163
By("Capturing Grafana snapshot of benchmark dashboard")
145-
snapshotURL, snapErr := grafanaClient.CreateSnapshot(scenarioStart)
164+
snapResult, snapErr := grafanaClient.CreateSnapshot(scenarioStart)
146165
if snapErr != nil {
147166
GinkgoWriter.Printf("Warning: failed to create Grafana snapshot: %v\n", snapErr)
148167
} else {
149-
results.GrafanaSnapshotURL = snapshotURL
150-
GinkgoWriter.Printf("Grafana snapshot: %s\n", snapshotURL)
168+
results.GrafanaSnapshotURL = snapResult.URL
169+
GinkgoWriter.Printf("Grafana snapshot: %s\n", snapResult.URL)
151170

152171
if benchCfg.GrafanaSnapshotFile != "" {
153-
if writeErr := os.WriteFile(benchCfg.GrafanaSnapshotFile, []byte(snapshotURL+"\n"), 0644); writeErr != nil {
154-
GinkgoWriter.Printf("Warning: failed to write snapshot file: %v\n", writeErr)
172+
if writeErr := os.WriteFile(benchCfg.GrafanaSnapshotFile, []byte(snapResult.URL+"\n"), 0644); writeErr != nil {
173+
GinkgoWriter.Printf("Warning: failed to write snapshot URL file: %v\n", writeErr)
174+
}
175+
}
176+
177+
// Export full snapshot JSON for offline re-import
178+
if benchCfg.GrafanaSnapshotJSONFile != "" {
179+
By("Exporting Grafana snapshot JSON")
180+
if exportErr := grafanaClient.ExportSnapshotJSON(snapResult.Key, benchCfg.GrafanaSnapshotJSONFile); exportErr != nil {
181+
GinkgoWriter.Printf("Warning: failed to export snapshot JSON: %v\n", exportErr)
182+
} else {
183+
GinkgoWriter.Printf("Snapshot JSON exported to %s\n", benchCfg.GrafanaSnapshotJSONFile)
155184
}
156185
}
157186
}
187+
188+
// Render all panels to PNG
189+
if benchCfg.GrafanaPanelDir != "" {
190+
By("Rendering dashboard panels to PNG")
191+
panelFiles, renderErr := grafanaClient.RenderAllPanels(scenarioStart, time.Now(), benchCfg.GrafanaPanelDir)
192+
if renderErr != nil {
193+
GinkgoWriter.Printf("Warning: panel rendering failed: %v\n", renderErr)
194+
} else {
195+
GinkgoWriter.Printf("Rendered %d panels to %s\n", len(panelFiles), benchCfg.GrafanaPanelDir)
196+
}
197+
}
158198
}
159199

160200
By("Writing benchmark results to file")
@@ -234,7 +274,7 @@ var _ = Describe("Scale-Up Latency Benchmark", Label("benchmark"), Ordered, func
234274
}
235275
}
236276
g.Expect(runningCount).To(BeNumerically(">=", benchLoadWorkers))
237-
}, 3*time.Minute, 5*time.Second).Should(Succeed())
277+
}, 5*time.Minute, 5*time.Second).Should(Succeed())
238278

239279
By("Polling replicas to detect scale-up")
240280
scaleUpDetected := false

test/benchmark/config.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,10 @@ type BenchmarkConfig struct {
5151
BenchmarkResultsFile string
5252

5353
// Grafana
54-
GrafanaEnabled bool // Deploy ephemeral Grafana and capture snapshot
55-
GrafanaSnapshotFile string // Path to write snapshot URL
54+
GrafanaEnabled bool // Deploy ephemeral Grafana and capture snapshot
55+
GrafanaSnapshotFile string // Path to write snapshot URL
56+
GrafanaSnapshotJSONFile string // Path to export full snapshot JSON (re-importable)
57+
GrafanaPanelDir string // Directory to write rendered panel PNGs
5658

5759
// Phase durations (seconds, overridable via env for tuning)
5860
BaselineDurationSec int
@@ -102,8 +104,10 @@ func LoadConfigFromEnv() BenchmarkConfig {
102104

103105
BenchmarkResultsFile: getEnv("BENCHMARK_RESULTS_FILE", "/tmp/benchmark-results.json"),
104106

105-
GrafanaEnabled: getEnvBool("BENCHMARK_GRAFANA_ENABLED", true),
106-
GrafanaSnapshotFile: getEnv("BENCHMARK_GRAFANA_SNAPSHOT_FILE", "/tmp/benchmark-grafana-snapshot.txt"),
107+
GrafanaEnabled: getEnvBool("BENCHMARK_GRAFANA_ENABLED", true),
108+
GrafanaSnapshotFile: getEnv("BENCHMARK_GRAFANA_SNAPSHOT_FILE", "/tmp/benchmark-grafana-snapshot.txt"),
109+
GrafanaSnapshotJSONFile: getEnv("BENCHMARK_GRAFANA_SNAPSHOT_JSON", "/tmp/benchmark-grafana-snapshot.json"),
110+
GrafanaPanelDir: getEnv("BENCHMARK_GRAFANA_PANEL_DIR", "/tmp/benchmark-panels"),
107111

108112
BaselineDurationSec: getEnvInt("BENCHMARK_BASELINE_DURATION", 120),
109113
SpikeDurationSec: getEnvInt("BENCHMARK_SPIKE_DURATION", 300),

0 commit comments

Comments
 (0)