name: Cold Start Benchmark
# Label-gated cold-start regression gate (Phase 1 / MEAS-08; D-14, D-16, D-17).
# Runs only when:
# 1. A PR carries the `run-benchmarks` label (verify mode: fail if regression exceeds threshold).
# 2. A PR carries the `run-benchmark-snapshot` label (snapshot mode: capture new authoritative baseline).
# 3. A commit is pushed to a branch matching `cold-start/**`.
# 4. A maintainer triggers the workflow manually via workflow_dispatch.
#
# Workflow structure: scenarios run in parallel matrix jobs so a single slow
# scenario cannot exhaust the 30-minute job budget for the whole pipeline.
# Each matrix job owns ONE scenario; a final aggregation job assembles results.
#
# Regression policy: any matrix job with a regression fails the workflow
# AND posts a bot comment on the PR with the numeric diff, sourced from the
# scenario's reports/regression_comment.md.
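#
# Manual dispatch sketch (illustrative; assumes an authenticated gh CLI):
#
#   gh workflow run cold-start-benchmark.yml -f mode=snapshot -f ref=main
#
# The ref input is optional; when it is empty, the checkout steps fall back
# to github.ref.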
on:
workflow_dispatch:
inputs:
mode:
description: "verify: run gate against thresholds.json. snapshot: capture new baseline."
required: false
default: verify
type: choice
options:
- verify
- snapshot
ref:
description: "Git ref to checkout (branch, tag, or commit SHA)."
required: false
default: ""
type: string
pull_request:
types: [opened, labeled, synchronize, reopened]
push:
branches:
- "cold-start/**"
permissions:
contents: read
pull-requests: write
jobs:
# One-shot job that decides mode once for the whole workflow and emits it
# as a job output. Matrix jobs read this instead of each re-evaluating the
# (messy) PR-label + inputs + event-name expression.
mode:
name: "Resolve mode"
if: >-
contains(github.event.pull_request.labels.*.name, 'run-benchmarks') ||
contains(github.event.pull_request.labels.*.name, 'run-benchmark-snapshot') ||
github.event_name == 'workflow_dispatch' ||
github.event_name == 'push'
runs-on: ubuntu-latest
timeout-minutes: 2
outputs:
mode: ${{ steps.resolve.outputs.mode }}
steps:
- id: resolve
run: |
# Mode selection precedence:
# 1. workflow_dispatch input (maintainer-specified)
# 2. run-benchmark-snapshot label on the PR (snapshot mode)
# 3. default: verify mode (regression gate)
if [ -n "${{ github.event.inputs.mode }}" ]; then
mode="${{ github.event.inputs.mode }}"
elif ${{ contains(github.event.pull_request.labels.*.name, 'run-benchmark-snapshot') }}; then
mode="snapshot"
else
mode="verify"
fi
echo "mode=$mode" >> "$GITHUB_OUTPUT"
echo "Resolved mode: $mode"
# Build benchmarks-lean, benchmarks-lean-uncompiled, and lfx-reference once,
# save all three as a tarball, and upload it as an artifact. Matrix jobs
# `docker load` it. Cuts wall-clock time by ~6 minutes and avoids the
# "--skip-build needs both images present" failure mode.
build-images:
name: "Build images"
needs: mode
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.ref || github.ref }}
- name: Build benchmarks-lean
run: |
docker build \
--build-arg BENCH_VARIANT=lean \
-t benchmarks-lean \
-f src/backend/tests/benchmarks/Dockerfile \
.
- name: Build benchmarks-lean-uncompiled (strip .pyc from benchmarks-lean)
run: |
# Mirrors driver.py's Option A1 wrapper Dockerfile (plan 02-06 /
# 01-CONTEXT.md D-11a/D-12a). Produces the "uncompiled" variant by
# deleting __pycache__/.pyc/.pyo from the landed image's /app/.venv.
cat > /tmp/Dockerfile.uncompiled <<'DOCKERFILE'
FROM benchmarks-lean
RUN find /app/.venv -name '__pycache__' -type d -exec rm -rf {} + 2>/dev/null || true \
&& find /app/.venv -name '*.pyc' -delete \
&& find /app/.venv -name '*.pyo' -delete
DOCKERFILE
docker build -t benchmarks-lean-uncompiled -f /tmp/Dockerfile.uncompiled .
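# Sanity-check sketch (illustrative; not executed in CI): the stripped image
# should contain no bytecode files:
#   docker run --rm benchmarks-lean-uncompiled sh -c "find /app/.venv -name '*.pyc' | wc -l"
# Expected output: 0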
- name: Build lfx-reference image (CNT-01 reference Dockerfile)
run: |
# Builds the patched src/lfx/docker/Dockerfile image that Plan 05-01
# delivers (Python 3.13-alpine, --no-install-project layer separation,
# UV_COMPILE_BYTECODE=1). This is the image the lfx_reference_image
# scenario measures (CNT-01 authoritative signal).
docker build \
-t lfx-reference \
-f src/lfx/docker/Dockerfile \
.
- name: Verify deps layer cache (CNT-02 repeat-build assertion)
# After the initial lfx-reference build, make a no-op source change
# and re-run docker build. If Plan 05-01's --no-install-project patch works,
# the deps layer cache-hits and total build time is ~5-15s. If the patch was
# reverted or broken, the first uv sync reruns and this step exceeds the
# 30s target.
#
# The no-op change appends a trailing comment to src/lfx/src/lfx/__init__.py.
# It is harmless (a single-line comment; it doesn't affect imports or module
# attributes) but invalidates the source COPY layer at Dockerfile line 39.
run: |
set -e
echo "# CNT-02 cache-verification no-op touch: $(date +%s)" >> src/lfx/src/lfx/__init__.py
start=$(date +%s)
docker build \
-t lfx-reference \
-f src/lfx/docker/Dockerfile \
.
end=$(date +%s)
elapsed=$((end - start))
echo "Repeat build elapsed: ${elapsed}s (CNT-02 target: <30s)"
if [ "$elapsed" -ge 30 ]; then
echo "::error ::CNT-02 FAILED: repeat build took ${elapsed}s (>=30s). The deps layer is NOT cache-hit. Verify --no-install-project is present on the first uv sync in src/lfx/docker/Dockerfile (Plan 05-01)."
exit 1
fi
# Restore src/lfx/src/lfx/__init__.py so downstream steps see the unmodified tree.
git checkout -- src/lfx/src/lfx/__init__.py
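# Manual cache check (illustrative assumption about BuildKit output): on the
# repeat build, a cache-hit dependency layer is printed as, e.g.
#   => CACHED [...] RUN uv sync ... --no-install-project
# If that step shows as running instead, the cache was invalidated.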
- name: Save all images to tarball
run: |
docker save benchmarks-lean benchmarks-lean-uncompiled lfx-reference | gzip > /tmp/images.tar.gz
ls -la /tmp/images.tar.gz
- name: Upload images artifact
uses: actions/upload-artifact@v6
with:
name: bench-images
path: /tmp/images.tar.gz
retention-days: 1
if-no-files-found: error
scenario:
name: "bench:${{ matrix.scenario }}"
needs: [mode, build-images]
strategy:
fail-fast: false
matrix:
scenario:
- lfx_bare
- lfx_with_flow
- lfx_with_flow_prebaked
- langflow_run_http_ready
# langflow_run_no_change_restart stays non-blocking until its
# self_measuring dispatch shape is stable under CI load; its
# thresholds.json entry is carried over from the 2026-04-20 snapshot
# because recent runs produce ~1ms garbage (harness bug, not a real
# measurement).
- langflow_run_no_change_restart
- lfx_reference_image
runs-on: ubuntu-latest
# Budgets:
# - langflow_run_http_ready: 10 min. Single cold boot to TCP-ready
# (~30-40s on Linux CI) plus hyperfine's 5 runs.
# - langflow_run_no_change_restart: 15 min. Its supervisor runs TWO full
# langflow boots per hyperfine iteration (pre-warm + measured), and
# hyperfine defaults to runs=5, so ~10 boots at ~35s each on Linux CI.
# - all others: 20 min.
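# The expression below emulates a ternary with `cond && a || b`; the idiom is
# safe here because the middle operands (10, 15) are truthy.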
timeout-minutes: ${{ matrix.scenario == 'langflow_run_http_ready' && 10 || (matrix.scenario == 'langflow_run_no_change_restart' && 15 || 20) }}
# Matrix-entry-level continue-on-error:
# - langflow_run_no_change_restart: stays non-blocking until its
# self_measuring dispatch is fixed (currently produces ~1ms garbage).
# - lfx_reference_image: stays non-blocking per D-12 amendment.
continue-on-error: ${{ matrix.scenario == 'langflow_run_no_change_restart' || matrix.scenario == 'lfx_reference_image' }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.ref || github.ref }}
- name: Set up Python 3.13
uses: actions/setup-python@v6
with:
python-version: "3.13"
- name: Install uv
uses: astral-sh/setup-uv@v6
with:
version: "latest"
- name: Install hyperfine
run: |
sudo apt-get update
sudo apt-get install -y hyperfine
- name: Install benchmark dependencies
run: |
uv sync --group benchmarks
- name: Download prebuilt images
uses: actions/download-artifact@v6
with:
name: bench-images
path: /tmp
- name: Load images into docker
run: |
gunzip -c /tmp/images.tar.gz | docker load
docker images | grep -E 'benchmarks|lfx-reference'
- name: Run scenario
id: bench
continue-on-error: ${{ matrix.scenario == 'langflow_run_no_change_restart' || matrix.scenario == 'lfx_reference_image' }}
env:
CONTAINER_CMD: docker
run: |
mode="${{ needs.mode.outputs.mode }}"
scenario="${{ matrix.scenario }}"
echo "Running scenario=$scenario in mode=$mode"
verify_flag=""
if [ "$mode" = "verify" ]; then
verify_flag="--verify"
fi
uv run python -m src.backend.tests.benchmarks.driver \
--mode docker \
--scenarios "$scenario" \
$verify_flag \
--output-dir src/backend/tests/benchmarks/reports \
--skip-build
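# Local reproduction sketch (assumes the same driver CLI; drop --skip-build
# to build the images first):
#   uv run python -m src.backend.tests.benchmarks.driver \
#     --mode docker --scenarios lfx_bare --verify \
#     --output-dir src/backend/tests/benchmarks/reports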
- name: Upload scenario artifact
if: always()
uses: actions/upload-artifact@v6
with:
name: scenario-${{ matrix.scenario }}
path: src/backend/tests/benchmarks/reports/
if-no-files-found: warn
- name: Post regression comment on failure
if: >-
failure() &&
github.event.pull_request &&
matrix.scenario != 'langflow_run_no_change_restart' &&
matrix.scenario != 'lfx_reference_image'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
if [ -f src/backend/tests/benchmarks/reports/regression_comment.md ]; then
gh pr comment "${{ github.event.pull_request.number }}" \
--body-file src/backend/tests/benchmarks/reports/regression_comment.md
else
echo "No regression_comment.md produced by driver for scenario ${{ matrix.scenario }}. Inspect the harness step logs above."
fi
aggregate:
name: "Aggregate"
if: always() && needs.mode.result == 'success'
needs: [mode, scenario]
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
ref: ${{ github.event.inputs.ref || github.ref }}
- name: Download all scenario artifacts
uses: actions/download-artifact@v6
with:
path: all-scenarios
pattern: scenario-*
merge-multiple: false
- name: Merge scenario reports
run: |
mkdir -p src/backend/tests/benchmarks/reports
# Each scenario uploaded its entire reports/ dir under its own name.
# Merge them into one dir for the snapshot step (or the uploaded rollup).
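# With merge-multiple: false, each artifact lands in its own subdirectory;
# illustrative layout:
#   all-scenarios/scenario-lfx_bare/lfx_bare.json
#   all-scenarios/scenario-lfx_bare/regression_comment.md  (only on regression)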
for dir in all-scenarios/scenario-*; do
[ -d "$dir" ] || continue
cp -rv "$dir"/. src/backend/tests/benchmarks/reports/ || true
done
echo "=== Merged reports/ contents ==="
ls -la src/backend/tests/benchmarks/reports/ || true
- name: Set up Python 3.13
uses: actions/setup-python@v6
with:
python-version: "3.13"
- name: Install uv
uses: astral-sh/setup-uv@v6
with:
version: "latest"
- name: Install benchmark dependencies
run: |
uv sync --group benchmarks
- name: Snapshot mode - assemble thresholds.json
if: needs.mode.outputs.mode == 'snapshot'
env:
CAPTURED_REF: ${{ github.ref_name }}@${{ github.sha }}
CAPTURED_RUNNER: ubuntu-latest (GitHub Actions)
run: |
# Read per-scenario hyperfine JSONs produced by matrix jobs, assemble
# thresholds.json. Does not re-invoke the driver.
python3 <<'PY'
import json, os, pathlib, sys
from datetime import datetime, timezone
reports = pathlib.Path("src/backend/tests/benchmarks/reports")
out_thresholds = pathlib.Path("src/backend/tests/benchmarks/thresholds.json")
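# Expected per-scenario hyperfine export shape (illustrative):
#   {"results": [{"command": "...", "mean": 1.23, "stddev": 0.04,
#                 "times": [1.21, 1.25, ...]}]}
# hyperfine reports seconds; the values are converted to ms below.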
tracked = [
"lfx_bare",
"lfx_with_flow",
"lfx_with_flow_prebaked",
"langflow_run_http_ready",
"langflow_run_no_change_restart",
"lfx_reference_image",
]
scenarios_out = {}
for name in tracked:
hfile = reports / f"{name}.json"
if not hfile.exists() or hfile.stat().st_size == 0:
print(f"WARNING: {hfile} missing or empty; writing sentinel for {name}", file=sys.stderr)
scenarios_out[name] = {"mean_ms": 0, "stddev_ms": 0, "runs": 0}
continue
try:
data = json.loads(hfile.read_text())
except json.JSONDecodeError as exc:
print(f"WARNING: {hfile} is malformed JSON ({exc}); sentinel for {name}", file=sys.stderr)
scenarios_out[name] = {"mean_ms": 0, "stddev_ms": 0, "runs": 0}
continue
results = data.get("results", [])
if not results:
print(f"WARNING: {hfile} has no results; sentinel for {name}", file=sys.stderr)
scenarios_out[name] = {"mean_ms": 0, "stddev_ms": 0, "runs": 0}
continue
r = results[0]
times = r.get("times", [])
scenarios_out[name] = {
"mean_ms": round(r.get("mean", 0) * 1000, 2),
"stddev_ms": round(r.get("stddev", 0) * 1000, 2),
"runs": len(times),
}
thresholds = {
"schema_version": 1,
"measurement_mode": "bytecode_compile_delta",
"captured_on": datetime.now(timezone.utc).date().isoformat(),
"captured_ref": os.environ.get("CAPTURED_REF", "unknown"),
"captured_runner": os.environ.get("CAPTURED_RUNNER", "ubuntu-latest (GitHub Actions)"),
"python_version": "3.13",
"allowed_regression_pct": 15,
"_note": (
"Captured on Linux CI via the cold-start-benchmark matrix workflow. "
"measurement_mode is bytecode_compile_delta."
),
"scenarios": scenarios_out,
}
tmp = out_thresholds.with_suffix(out_thresholds.suffix + ".tmp")
tmp.write_text(json.dumps(thresholds, indent=2) + "\n", encoding="utf-8")
tmp.replace(out_thresholds)
print("Wrote thresholds.json:")
print(json.dumps(thresholds, indent=2))
PY
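# Resulting thresholds.json sketch (illustrative values):
#   {"schema_version": 1, "allowed_regression_pct": 15,
#    "scenarios": {"lfx_bare": {"mean_ms": 812.4, "stddev_ms": 23.1, "runs": 5}, ...}}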
- name: Verify mode - final gate summary
if: needs.mode.outputs.mode == 'verify'
run: |
# Per-scenario verify already ran in the matrix jobs (each one exited
# non-zero on regression). This step just summarizes the results for
# the workflow-run UI.
echo "## Verify mode results" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
# GitHub doesn't expose per-matrix-cell results through needs.scenario.result,
# so we use the presence of merged regression_comment*.md files as the
# regression signal.
if ls src/backend/tests/benchmarks/reports/regression_comment*.md 1> /dev/null 2>&1; then
echo "One or more scenarios regressed. See the bot comments on the PR." >> "$GITHUB_STEP_SUMMARY"
exit 1
else
echo "All scenarios within tolerance ✓" >> "$GITHUB_STEP_SUMMARY"
fi
- name: Upload final rollup
if: always()
uses: actions/upload-artifact@v6
with:
name: cold-start-benchmark-reports
path: |
src/backend/tests/benchmarks/reports/
src/backend/tests/benchmarks/thresholds.json
if-no-files-found: warn
- name: Record override label usage
if: always() && contains(github.event.pull_request.labels.*.name, 'benchmarks:override')
run: |
{
echo "## benchmarks:override applied"
echo ""
echo "PR #${{ github.event.pull_request.number }} carries the \`benchmarks:override\` label."
echo "Actor: ${{ github.actor }}"
echo "Event: ${{ github.event_name }}"
echo ""
echo "The override does NOT bypass the workflow's failure status; it is recorded here for audit."
} >> "$GITHUB_STEP_SUMMARY"
- name: Snapshot commit note
if: >-
success() && needs.mode.outputs.mode == 'snapshot'
run: |
{
echo "## Snapshot run complete"
echo ""
echo "The aggregation step assembled src/backend/tests/benchmarks/thresholds.json from the per-scenario matrix artifacts."
echo "Download the \`cold-start-benchmark-reports\` artifact and commit the updated thresholds.json"
echo "plus the baseline md/json files manually in a PR titled something like"
echo "\`docs(bench): capture release-1.9.0 baseline thresholds\`."
echo ""
echo "Per Pitfall 11 this snapshot is authoritative only because it ran on the GHA Linux runner."
} >> "$GITHUB_STEP_SUMMARY"