name: Cold Start Benchmark
# Label-gated cold-start regression gate (Phase 1 / MEAS-08; D-14, D-16, D-17).
# Runs only when:
# 1. A PR carries the `run-benchmarks` label (verify mode: fail if regression exceeds threshold).
# 2. A PR carries the `run-benchmark-snapshot` label (snapshot mode: capture new authoritative baseline).
# 3. A commit is pushed to a branch matching `cold-start/**`.
# 4. A maintainer triggers the workflow manually via workflow_dispatch.
#
# Workflow structure: scenarios run in parallel matrix jobs so a single slow
# scenario cannot exhaust the 30-minute job budget for the whole pipeline.
# Each matrix job owns ONE scenario; a final aggregation job assembles results.
#
# Regression policy: any matrix job with a regression fails the workflow
# AND posts a bot comment on the PR with the numeric diff, sourced from the
# scenario's reports/regression_comment.md.
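#
# Manual dispatch sketch (illustrative; assumes an authenticated gh CLI):
#
#   gh workflow run cold-start-benchmark.yml -f mode=snapshot -f ref=main
#
# The ref input is optional; when it is empty, the checkout steps fall back
# to github.ref.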
on:
workflow_dispatch:
inputs:
mode:
description: "verify: run gate against thresholds.json. snapshot: capture new baseline."
required: false
default: verify
type: choice
options:
- verify
- snapshot
ref:
description: "Git ref to checkout (branch, tag, or commit SHA)."
required: false
default: ""
type: string
pull_request:
types: [opened, labeled, synchronize, reopened]
push:
branches:
- "cold-start/**"
permissions:
contents: read
pull-requests: write
jobs:
# One-shot job that decides mode once for the whole workflow and emits it
# as a job output. Matrix jobs read this instead of each re-evaluating the
# (messy) PR-label + inputs + event-name expression.
mode:
name: "Resolve mode"
if: >-
contains(github.event.pull_request.labels.*.name, 'run-benchmarks') ||
contains(github.event.pull_request.labels.*.name, 'run-benchmark-snapshot') ||
github.event_name == 'workflow_dispatch' ||
github.event_name == 'push'
runs-on: ubuntu-latest
timeout-minutes: 2
outputs:
mode: ${{ steps.resolve.outputs.mode }}
steps:
- id: resolve
run: |
# Mode selection precedence:
# 1. workflow_dispatch input (maintainer-specified)
# 2. run-benchmark-snapshot label on the PR (snapshot mode)
# 3. default: verify mode (regression gate)
if [ -n "${{ github.event.inputs.mode }}" ]; then
mode="${{ github.event.inputs.mode }}"
elif ${{ contains(github.event.pull_request.labels.*.name, 'run-benchmark-snapshot') }}; then
mode="snapshot"
else
mode="verify"
fi
echo "mode=$mode" >> "$GITHUB_OUTPUT"
echo "Resolved mode: $mode"
# Build benchmarks-lean, benchmarks-lean-uncompiled, and lfx-reference once,
# save all three as a tarball, and upload it as an artifact. Matrix jobs
# `docker load` it. Cuts wall-clock time by ~6 minutes and avoids the
# "--skip-build needs both images present" failure mode.
build-images:
name: "Build images"
needs: mode
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.ref || github.ref }}
- name: Build benchmarks-lean
run: |
docker build \
--build-arg BENCH_VARIANT=lean \
-t benchmarks-lean \
-f src/backend/tests/benchmarks/Dockerfile \
.
- name: Build benchmarks-lean-uncompiled (strip .pyc from benchmarks-lean)
run: |
# Mirrors driver.py's Option A1 wrapper Dockerfile (plan 02-06 /
# 01-CONTEXT.md D-11a/D-12a). Produces the "uncompiled" variant by
# deleting __pycache__/.pyc/.pyo from the landed image's /app/.venv.
cat > /tmp/Dockerfile.uncompiled <<'DOCKERFILE'
FROM benchmarks-lean
RUN find /app/.venv -name '__pycache__' -type d -exec rm -rf {} + 2>/dev/null || true \
&& find /app/.venv -name '*.pyc' -delete \
&& find /app/.venv -name '*.pyo' -delete
DOCKERFILE
docker build -t benchmarks-lean-uncompiled -f /tmp/Dockerfile.uncompiled .
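# Sanity-check sketch (illustrative; not executed in CI): the stripped image
# should contain no bytecode files:
#   docker run --rm benchmarks-lean-uncompiled sh -c "find /app/.venv -name '*.pyc' | wc -l"
# Expected output: 0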
- name: Build lfx-reference image (CNT-01 reference Dockerfile)
run: |
# Builds the patched src/lfx/docker/Dockerfile image that Plan 05-01
# delivers (Python 3.13-alpine, --no-install-project layer separation,
# UV_COMPILE_BYTECODE=1). This is the image the lfx_reference_image
# scenario measures (CNT-01 authoritative signal).
docker build \
-t lfx-reference \
-f src/lfx/docker/Dockerfile \
.
- name: Verify deps layer cache (CNT-02 repeat-build assertion)
# After the initial lfx-reference build, make a no-op source change
# and re-run docker build. If Plan 05-01's --no-install-project patch works,
# the deps layer cache-hits and total build time is ~5-15s. If the patch was
# reverted or broken, the first uv sync reruns and this step exceeds the
# 30s target.
#
# The no-op change appends a trailing comment to src/lfx/src/lfx/__init__.py.
# It is harmless (a single-line comment; it doesn't affect imports or module
# attributes) but invalidates the source COPY layer at Dockerfile line 39.
run: |
set -e
echo "# CNT-02 cache-verification no-op touch: $(date +%s)" >> src/lfx/src/lfx/__init__.py
start=$(date +%s)
docker build \
-t lfx-reference \
-f src/lfx/docker/Dockerfile \
.
end=$(date +%s)
elapsed=$((end - start))
echo "Repeat build elapsed: ${elapsed}s (CNT-02 target: <30s)"
if [ "$elapsed" -ge 30 ]; then
echo "::error ::CNT-02 FAILED: repeat build took ${elapsed}s (>=30s). The deps layer is NOT cache-hit. Verify --no-install-project is present on the first uv sync in src/lfx/docker/Dockerfile (Plan 05-01)."
exit 1
fi
# Restore src/lfx/src/lfx/__init__.py so downstream steps see the unmodified tree.
git checkout -- src/lfx/src/lfx/__init__.py
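# Manual cache check (illustrative assumption about BuildKit output): on the
# repeat build, a cache-hit dependency layer is printed as, e.g.
#   => CACHED [...] RUN uv sync ... --no-install-project
# If that step shows as running instead, the cache was invalidated.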
- name: Save all images to tarball
run: |
docker save benchmarks-lean benchmarks-lean-uncompiled lfx-reference | gzip > /tmp/images.tar.gz
ls -la /tmp/images.tar.gz
- name: Upload images artifact
uses: actions/upload-artifact@v6
with:
name: bench-images
path: /tmp/images.tar.gz
retention-days: 1
if-no-files-found: error
scenario:
name: "bench:${{ matrix.scenario }}"
needs: [mode, build-images]
strategy:
fail-fast: false
matrix:
scenario:
- lfx_bare
- lfx_with_flow
- lfx_with_flow_prebaked
- langflow_run_http_ready
# langflow_run_no_change_restart stays non-blocking until its
# self_measuring dispatch shape is stable under CI load; its
# thresholds.json entry is carried over from the 2026-04-20 snapshot
# because recent runs produce ~1ms garbage (harness bug, not a real
# measurement).
- langflow_run_no_change_restart
- lfx_reference_image
runs-on: ubuntu-latest
# Budgets:
# - langflow_run_http_ready: 10 min. Single cold boot to TCP-ready
# (~30-40s on Linux CI) plus hyperfine's 5 runs.
# - langflow_run_no_change_restart: 15 min. Its supervisor runs TWO full
# langflow boots per hyperfine iteration (pre-warm + measured), and
# hyperfine defaults to runs=5, so ~10 boots at ~35s each on Linux CI.
# - all others: 20 min.
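# The expression below emulates a ternary with `cond && a || b`; the idiom is
# safe here because the middle operands (10, 15) are truthy.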
timeout-minutes: ${{ matrix.scenario == 'langflow_run_http_ready' && 10 || (matrix.scenario == 'langflow_run_no_change_restart' && 15 || 20) }}
# Matrix-entry-level continue-on-error:
# - langflow_run_no_change_restart: stays non-blocking until its
# self_measuring dispatch is fixed (currently produces ~1ms garbage).
# - lfx_reference_image: stays non-blocking per D-12 amendment.
continue-on-error: ${{ matrix.scenario == 'langflow_run_no_change_restart' || matrix.scenario == 'lfx_reference_image' }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.ref || github.ref }}
- name: Set up Python 3.13
uses: actions/setup-python@v6
with:
python-version: "3.13"
- name: Install uv
uses: astral-sh/setup-uv@v6
with:
version: "latest"
- name: Install hyperfine
run: |
sudo apt-get update
sudo apt-get install -y hyperfine
- name: Install benchmark dependencies
run: |
uv sync --group benchmarks
- name: Download prebuilt images
uses: actions/download-artifact@v6
with:
name: bench-images
path: /tmp
- name: Load images into docker
run: |
gunzip -c /tmp/images.tar.gz | docker load
docker images | grep -E 'benchmarks|lfx-reference'
- name: Run scenario
id: bench
continue-on-error: ${{ matrix.scenario == 'langflow_run_no_change_restart' || matrix.scenario == 'lfx_reference_image' }}
env:
CONTAINER_CMD: docker
run: |
mode="${{ needs.mode.outputs.mode }}"
scenario="${{ matrix.scenario }}"
echo "Running scenario=$scenario in mode=$mode"
verify_flag=""
if [ "$mode" = "verify" ]; then
verify_flag="--verify"
fi
uv run python -m src.backend.tests.benchmarks.driver \
--mode docker \
--scenarios "$scenario" \
$verify_flag \
--output-dir src/backend/tests/benchmarks/reports \
--skip-build
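# Local reproduction sketch (assumes the same driver CLI; drop --skip-build
# to build the images first):
#   uv run python -m src.backend.tests.benchmarks.driver \
#     --mode docker --scenarios lfx_bare --verify \
#     --output-dir src/backend/tests/benchmarks/reports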
- name: Upload scenario artifact
if: always()
uses: actions/upload-artifact@v6
with:
name: scenario-${{ matrix.scenario }}
path: src/backend/tests/benchmarks/reports/
if-no-files-found: warn
- name: Post regression comment on failure
if: >-
failure() &&
github.event.pull_request &&
matrix.scenario != 'langflow_run_no_change_restart' &&
matrix.scenario != 'lfx_reference_image'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
if [ -f src/backend/tests/benchmarks/reports/regression_comment.md ]; then
gh pr comment "${{ github.event.pull_request.number }}" \
--body-file src/backend/tests/benchmarks/reports/regression_comment.md
else
echo "No regression_comment.md produced by driver for scenario ${{ matrix.scenario }}. Inspect the harness step logs above."
fi
aggregate:
name: "Aggregate"
if: always() && needs.mode.result == 'success'
needs: [mode, scenario]
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.ref || github.ref }}
- name: Download all scenario artifacts
uses: actions/download-artifact@v6
with:
path: all-scenarios
pattern: scenario-*
merge-multiple: false
- name: Merge scenario reports
run: |
mkdir -p src/backend/tests/benchmarks/reports
# Each scenario uploaded its entire reports/ dir under its own name.
# Merge them into one dir for the snapshot step (or the uploaded rollup).
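# With merge-multiple: false, each artifact lands in its own subdirectory;
# illustrative layout:
#   all-scenarios/scenario-lfx_bare/lfx_bare.json
#   all-scenarios/scenario-lfx_bare/regression_comment.md  (only on regression)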
for dir in all-scenarios/scenario-*; do
[ -d "$dir" ] || continue
cp -rv "$dir"/. src/backend/tests/benchmarks/reports/ || true
done
echo "=== Merged reports/ contents ==="
ls -la src/backend/tests/benchmarks/reports/ || true
- name: Set up Python 3.13
uses: actions/setup-python@v6
with:
python-version: "3.13"
- name: Install uv
uses: astral-sh/setup-uv@v6
with:
version: "latest"
- name: Install benchmark dependencies
run: |
uv sync --group benchmarks
- name: Snapshot mode - assemble thresholds.json
if: needs.mode.outputs.mode == 'snapshot'
env:
CAPTURED_REF: ${{ github.ref_name }}@${{ github.sha }}
CAPTURED_RUNNER: ubuntu-latest (GitHub Actions)
run: |
# Read per-scenario hyperfine JSONs produced by matrix jobs, assemble
# thresholds.json. Does not re-invoke the driver.
python3 <<'PY'
import json, os, pathlib, sys
from datetime import datetime, timezone
reports = pathlib.Path("src/backend/tests/benchmarks/reports")
out_thresholds = pathlib.Path("src/backend/tests/benchmarks/thresholds.json")
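# Expected per-scenario hyperfine export shape (illustrative):
#   {"results": [{"command": "...", "mean": 1.23, "stddev": 0.04,
#                 "times": [1.21, 1.25, ...]}]}
# hyperfine reports seconds; the values are converted to ms below.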
tracked = [
"lfx_bare",
"lfx_with_flow",
"lfx_with_flow_prebaked",
"langflow_run_http_ready",
"langflow_run_no_change_restart",
"lfx_reference_image",
]
scenarios_out = {}
for name in tracked:
hfile = reports / f"{name}.json"
if not hfile.exists() or hfile.stat().st_size == 0:
print(f"WARNING: {hfile} missing or empty; writing sentinel for {name}", file=sys.stderr)
scenarios_out[name] = {"mean_ms": 0, "stddev_ms": 0, "runs": 0}
continue
try:
data = json.loads(hfile.read_text())
except json.JSONDecodeError as exc:
print(f"WARNING: {hfile} is malformed JSON ({exc}); sentinel for {name}", file=sys.stderr)
scenarios_out[name] = {"mean_ms": 0, "stddev_ms": 0, "runs": 0}
continue
results = data.get("results", [])
if not results:
print(f"WARNING: {hfile} has no results; sentinel for {name}", file=sys.stderr)
scenarios_out[name] = {"mean_ms": 0, "stddev_ms": 0, "runs": 0}
continue
r = results[0]
times = r.get("times", [])
scenarios_out[name] = {
"mean_ms": round(r.get("mean", 0) * 1000, 2),
"stddev_ms": round(r.get("stddev", 0) * 1000, 2),
"runs": len(times),
}
thresholds = {
"schema_version": 1,
"measurement_mode": "bytecode_compile_delta",
"captured_on": datetime.now(timezone.utc).date().isoformat(),
"captured_ref": os.environ.get("CAPTURED_REF", "unknown"),
"captured_runner": os.environ.get("CAPTURED_RUNNER", "ubuntu-latest (GitHub Actions)"),
"python_version": "3.13",
"allowed_regression_pct": 15,
"_note": (
"Captured on Linux CI via the cold-start-benchmark matrix workflow. "
"measurement_mode is bytecode_compile_delta."
),
"scenarios": scenarios_out,
}
tmp = out_thresholds.with_suffix(out_thresholds.suffix + ".tmp")
tmp.write_text(json.dumps(thresholds, indent=2) + "\n", encoding="utf-8")
tmp.replace(out_thresholds)
print("Wrote thresholds.json:")
print(json.dumps(thresholds, indent=2))
PY
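# Resulting thresholds.json sketch (illustrative values):
#   {"schema_version": 1, "allowed_regression_pct": 15,
#    "scenarios": {"lfx_bare": {"mean_ms": 812.4, "stddev_ms": 23.1, "runs": 5}, ...}}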
- name: Verify mode - final gate summary
if: needs.mode.outputs.mode == 'verify'
run: |
# Per-scenario verify already ran in the matrix jobs (each one exited
# non-zero on regression). This step just summarizes the results for
# the workflow-run UI.
echo "## Verify mode results" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
# GitHub doesn't expose per-matrix-cell results through needs.scenario.result,
# so we use the presence of merged regression_comment*.md files as the
# regression signal.
if ls src/backend/tests/benchmarks/reports/regression_comment*.md 1> /dev/null 2>&1; then
echo "One or more scenarios regressed. See the bot comments on the PR." >> "$GITHUB_STEP_SUMMARY"
exit 1
else
echo "All scenarios within tolerance ✓" >> "$GITHUB_STEP_SUMMARY"
fi
- name: Upload final rollup
if: always()
uses: actions/upload-artifact@v6
with:
name: cold-start-benchmark-reports
path: |
src/backend/tests/benchmarks/reports/
src/backend/tests/benchmarks/thresholds.json
if-no-files-found: warn
- name: Record override label usage
if: always() && contains(github.event.pull_request.labels.*.name, 'benchmarks:override')
run: |
{
echo "## benchmarks:override applied"
echo ""
echo "PR #${{ github.event.pull_request.number }} carries the \`benchmarks:override\` label."
echo "Actor: ${{ github.actor }}"
echo "Event: ${{ github.event_name }}"
echo ""
echo "The override does NOT bypass the workflow's failure status; it is recorded here for audit."
} >> "$GITHUB_STEP_SUMMARY"
- name: Snapshot commit note
if: >-
success() && needs.mode.outputs.mode == 'snapshot'
run: |
{
echo "## Snapshot run complete"
echo ""
echo "The aggregation step assembled src/backend/tests/benchmarks/thresholds.json from the per-scenario matrix artifacts."
echo "Download the \`cold-start-benchmark-reports\` artifact and commit the updated thresholds.json"
echo "plus the baseline md/json files manually in a PR titled something like"
echo "\`docs(bench): capture release-1.9.0 baseline thresholds\`."
echo ""
echo "Per Pitfall 11 this snapshot is authoritative only because it ran on the GHA Linux runner."
} >> "$GITHUB_STEP_SUMMARY"