Skip to content

Commit 756903c

Browse files
killaguclaude
andcommitted
perf(ci): shard tests for ≤60s per ubuntu job; parallelize quality checks
Cut the monolithic test job (single ~190s vitest run, gated by a 170s cluster file; 12-29min on CI) into parallel, wall-time-balanced shards so each ubuntu-latest job's `vitest run` step stays ≤60s — verified on real CI (48 ubuntu shards run 15-59s). Test sharding (scripts/run-shard.js, driven by the CI matrix): - Fork-heavy packages split via vitest --shard: cluster→3, egg→2, mock→2, schedule→4, security→2, plus redis/multipart/development standalone; each capped with --maxWorkers so forked egg cluster processes don't oversubscribe the CPU and time out. - The light remainder is greedy bin-packed by per-dir cpu weight into 8 rest-* shards so heavy dirs land on separate shards. - Long single test files split by `describe` block (no semantic change, no removed cases) so vitest can parallelize them: cluster app_worker/agent_worker/https/master.others, mock cluster, schedule worker/subscription/immediate, egg watcher, development override. - macOS/Windows have ~5-slot runner concurrency, so they run the whole suite as one job per (os,node) instead of fanning out (which would queue ~9 batches). Quality job split into a parallel matrix (lint/typecheck/fmtcheck/build/site) so the slowest check — not their sum — bounds wall time. Latent bugs fixed along the way: - tegg controller ControllerMetaManager.test.ts intentionally triggers a duplicate-proto error that polluted the process-global registry under isolate:false, failing sibling files; isolated that project (isolate:true). - clean-dist used `ut run clean --workspaces` which aborts the && chain (root has no clean script) → scripts/clean-dist.js. - typecheck/pretest used `ut run <self> --workspaces`, which includes the root and recurses infinitely → scripts/run-workspaces.js (concurrent, root-bin PATH). - Windows runner images no longer set %HOME%, breaking egg_loader getHomedir; export HOME=%USERPROFILE% on windows test jobs. - egg-bin runs without --coverage to stay ≤60s (no threshold; report only). 
CI test jobs reduced 132→~56 to avoid flooding scarce macos/windows queues. Windows flakes absorbed by --retry. No test semantics changed or cases removed. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 5c856b3 commit 756903c

46 files changed

Lines changed: 2340 additions & 1535 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci.yml

Lines changed: 101 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,20 @@ permissions:
1818
actions: write
1919

2020
jobs:
21-
typecheck:
21+
# Static-analysis / build checks. Each check runs as its own parallel job
22+
# (instead of serial steps) so the slowest one — not their sum — bounds the
23+
# wall time. Goal: keep each ≤ 60s.
24+
quality:
25+
strategy:
26+
fail-fast: false
27+
matrix:
28+
check: ['lint', 'typecheck', 'fmtcheck', 'build', 'site']
29+
30+
name: Quality (${{ matrix.check }})
2231
runs-on: ubuntu-latest
2332

2433
concurrency:
25-
group: typecheck-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}
34+
group: quality-${{ matrix.check }}-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}
2635
cancel-in-progress: true
2736
steps:
2837
- name: Checkout repository
@@ -41,32 +50,86 @@ jobs:
4150
run: ut install --from pnpm || (sleep 5 && ut install --from pnpm) || (sleep 10 && ut install --from pnpm)
4251

4352
- name: Run lint
53+
if: ${{ matrix.check == 'lint' }}
4454
run: ut run lint
4555

4656
- name: Run typecheck
57+
if: ${{ matrix.check == 'typecheck' }}
4758
run: ut run typecheck
4859

4960
- name: Run format check
61+
if: ${{ matrix.check == 'fmtcheck' }}
5062
run: ut run fmtcheck
5163

5264
- name: Run build
65+
if: ${{ matrix.check == 'build' }}
5366
run: ut run build
5467

5568
- name: Run site build
69+
if: ${{ matrix.check == 'site' }}
5670
run: ut run site:build
5771

5872
test:
5973
strategy:
6074
fail-fast: false
6175
matrix:
62-
os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
76+
# ubuntu-latest runs the fine-grained shards: fork-heavy packages are
77+
# split into vitest --shard slices and the light remainder into
78+
# weight-balanced rest-* shards (see scripts/run-shard.js). Sized so each
79+
# shard's `vitest run` step stays ≤ ~60s on ubuntu-latest, where runner
80+
# concurrency is high enough to run them all in parallel.
81+
os: ['ubuntu-latest']
6382
node: ['22', '24']
64-
65-
name: Test (${{ matrix.os }}, ${{ matrix.node }})
83+
shard:
84+
- cluster-1
85+
- cluster-2
86+
- cluster-3
87+
- egg-1
88+
- egg-2
89+
- mock-1
90+
- mock-2
91+
- schedule-1
92+
- schedule-2
93+
- schedule-3
94+
- schedule-4
95+
- development
96+
- security-1
97+
- security-2
98+
- redis
99+
- multipart
100+
- rest-1
101+
- rest-2
102+
- rest-3
103+
- rest-4
104+
- rest-5
105+
- rest-6
106+
- rest-7
107+
- rest-8
108+
include:
109+
# macOS/Windows runners have very low concurrency (~5 macOS slots), so
110+
# fanning out 24 shards × 2 nodes would queue ~9 batches and blow up
111+
# wall time. Run the whole suite as a single job per (os, node) instead
112+
# — coverage without flooding the queue. (shard=all => no filter.)
113+
# These whole-suite jobs take ~13-25min and dominate the overall
114+
# workflow wall; the ≤60s-per-shard target applies to the ubuntu fan-out.
115+
- os: macos-latest
116+
node: '22'
117+
shard: all
118+
- os: macos-latest
119+
node: '24'
120+
shard: all
121+
- os: windows-latest
122+
node: '22'
123+
shard: all
124+
- os: windows-latest
125+
node: '24'
126+
shard: all
127+
128+
name: Test (${{ matrix.os }}, ${{ matrix.node }}, ${{ matrix.shard }})
66129
runs-on: ${{ matrix.os }}
67130

68131
concurrency:
69-
group: test-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}-(${{ matrix.os }}, ${{ matrix.node }})
132+
group: test-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}-(${{ matrix.os }}, ${{ matrix.node }}, ${{ matrix.shard }})
70133
cancel-in-progress: true
71134

72135
steps:
@@ -163,11 +226,29 @@ jobs:
163226
- name: Install dependencies
164227
run: ut install --from pnpm || (sleep 5 && ut install --from pnpm) || (sleep 10 && ut install --from pnpm)
165228

166-
- name: Run tests
167-
run: ut run ci
229+
- name: Prepare test fixtures (clean dist + db)
230+
# Mirrors the original `preci` (pretest) step: clears stale dist so tegg
231+
# plugin tests don't double-load src+dist, and runs per-workspace
232+
# pretest (e.g. orm DB table init). The shard runner invokes vitest
233+
# directly, bypassing the npm `preci` lifecycle, so do it explicitly.
234+
run: ut run pretest
235+
236+
- name: Set HOME on Windows
237+
# Newer windows-latest runner images leave %HOME% unset, while
238+
# @eggjs/core's getHomedir() falls back to os.homedir()
239+
# (C:\Users\runneradmin). egg_loader.test.ts asserts
240+
# getHomedir() === process.env.HOME, so align HOME with USERPROFILE the
241+
# way older runner images did. Unix runners already export HOME.
242+
if: ${{ matrix.os == 'windows-latest' }}
243+
shell: pwsh
244+
run: echo "HOME=$env:USERPROFILE" >> $env:GITHUB_ENV
245+
246+
- name: Run tests (shard ${{ matrix.shard }})
247+
run: node scripts/run-shard.js ${{ matrix.shard }} -- --coverage
168248

169249
- name: Run example tests
170-
if: ${{ matrix.os != 'windows-latest' }}
250+
# Run example tests once per (os, node) combo: on the rest-1 shard for the ubuntu fan-out, and on the single whole-suite (shard=all) job for macOS.
251+
if: ${{ matrix.os != 'windows-latest' && (matrix.shard == 'rest-1' || matrix.shard == 'all') }}
171252
run: |
172253
ut run example:test:all
173254
@@ -207,17 +288,19 @@ jobs:
207288
- name: Install dependencies
208289
run: ut install --from pnpm || (sleep 5 && ut install --from pnpm) || (sleep 10 && ut install --from pnpm)
209290

291+
- name: Set HOME on Windows
292+
if: ${{ matrix.os == 'windows-latest' }}
293+
shell: pwsh
294+
run: echo "HOME=$env:USERPROFILE" >> $env:GITHUB_ENV
295+
210296
- name: Run tests
297+
# `ci` (vitest --coverage) adds ~50s of instrumentation on egg-bin's
298+
# fork-heavy suite, pushing it past 60s. Run without coverage (`test`)
299+
# to keep the job ≤60s; egg-bin's fork tests can't be sharded (they
300+
# depend on cross-file shared state under isolate:false).
211301
run: |
212302
ut run build -- --workspace ./tools/egg-bin
213-
ut run ci --workspace @eggjs/bin
214-
215-
- name: Code Coverage
216-
# skip on windows, it will hangup on codecov https://github.com/codecov/codecov-action/issues/1787
217-
if: ${{ matrix.os != 'windows-latest' }}
218-
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de # v5
219-
with:
220-
use_oidc: true
303+
ut run test --workspace @eggjs/bin
221304
222305
test-egg-scripts:
223306
strategy:
@@ -265,7 +348,7 @@ jobs:
265348
- test
266349
- test-egg-bin
267350
- test-egg-scripts
268-
- typecheck
351+
- quality
269352
steps:
270353
- run: exit 1
271354
if: ${{ always() && (contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')) }}

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,8 @@ ecosystem-ci/examples
120120
pnpm-lock.yaml
121121
.utoo.toml
122122
.claude/
123+
124+
# benchmark output directories
125+
benchmark/ci-test/baseline*
126+
benchmark/ci-test/run*
127+
benchmark/ci-test/2*

AGENTS.md

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,55 @@ Egg is maintained as a utoo monorepo.
2828

2929
### Local CI
3030

31-
Run tests **without building first**. The CI workflow (`ut install → ut run ci`) never runs `build` before tests. If `dist/` directories exist from a prior build, tegg plugin tests will fail with `duplicate proto` errors because globby scans both `src/*.ts` and `dist/*.js`, loading the same decorated class twice.
31+
Run tests **without building first**. The CI test jobs run `ut run pretest`
32+
(clean dist + per-workspace pretest) then a vitest shard; they never `build`
33+
before tests. If `dist/` directories exist from a prior build, tegg plugin tests
34+
will fail with `duplicate proto` errors because globby scans both `src/*.ts` and
35+
`dist/*.js`, loading the same decorated class twice. `scripts/clean-dist.js`
36+
(run by `ut run clean-dist`) removes every `dist/` for you.
3237

33-
When you see `duplicate proto` failures locally:
38+
When you see `duplicate proto` failures locally, run `ut run clean-dist` (or the
39+
equivalent find below) and re-run:
3440

3541
```bash
3642
find tegg packages plugins tools -name dist -type d \
3743
-not -path '*/node_modules/*' -not -path '*/test/*' -not -path '*/fixtures/*' \
3844
-exec rm -rf {} +
3945
```
4046

47+
### Aggregator scripts (avoid `ut run <x> --workspaces` recursion)
48+
49+
The root `typecheck` / `pretest` aggregate per-workspace scripts via
50+
`node scripts/run-workspaces.js <script>`, **not** `ut run <script> --workspaces`.
51+
With utoo, `--workspaces` includes the monorepo root, so a root script that calls
52+
`ut run <same-name> --workspaces` recurses infinitely. `run-workspaces.js`
53+
enumerates real workspace dirs, runs each script body against the root-hoisted
54+
`node_modules/.bin` (so root-only CLIs like `tsgo` resolve even where a stale
55+
local bin shim exists), and runs them concurrently to bound wall time.
56+
4157
After removing the stale `dist/` directories, re-run tests.
4258

59+
### CI test sharding
60+
61+
CI splits the test suite across parallel runners via `scripts/run-shard.js`
62+
(`node scripts/run-shard.js <shard>`). Heavy fork-based packages are split into
62+
vitest `--shard` slices (`cluster-1..3`, `egg-1..2`, `mock-1..2`, `schedule-1..4`,
63+
`security-1..2`) or run standalone (`development`, `redis`, `multipart`), each with
64+
a `--maxWorkers` cap so forked egg cluster child processes do not oversubscribe the
65+
CPU and time out; `rest-1`..`rest-8` split everything else with full parallelism.
66+
The goal is to keep each shard's `vitest run` wall time under ~60s. To reproduce one shard locally: `node scripts/run-shard.js cluster-1`.
68+
Use `node scripts/run-shard.js all` for the unsharded full suite.
69+
70+
Cluster/mock/schedule tests fork real OS processes that bind ports. If a run is
71+
killed mid-flight, orphaned `start-cluster`/`app_worker`/`agent_worker`
72+
processes can linger and hold ports (e.g. 17001), causing later runs to fail
73+
with `EADDRINUSE`/`app.ready()` timeouts. Kill them before re-running:
74+
75+
```bash
76+
ps aux | grep -E "[s]tart-cluster|[c]luster/src/(app|agent)_worker" \
77+
| awk '{print $2}' | xargs kill -9
78+
```
79+
4380
## Coding Conventions
4481

4582
- prefer existing repo patterns over inventing new ones

package.json

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,21 @@
1313
],
1414
"type": "module",
1515
"scripts": {
16-
"clean-dist": "ut run clean --workspaces --if-present",
16+
"clean-dist": "node scripts/clean-dist.js",
1717
"build": "tsdown",
1818
"prelint": "ut run clean-dist",
1919
"lint": "oxlint --type-aware --type-check --quiet",
2020
"fmt": "oxfmt",
21-
"typecheck": "ut run clean-dist && ut run typecheck --workspaces --if-present",
21+
"typecheck": "ut run clean-dist && node scripts/run-workspaces.js typecheck",
2222
"fmtcheck": "oxfmt --check .",
23-
"pretest": "ut run clean-dist && ut run pretest --workspaces --if-present",
23+
"pretest": "ut run clean-dist && node scripts/run-workspaces.js pretest",
2424
"test": "vitest run --bail 1 --retry 2 --testTimeout 20000 --hookTimeout 20000",
2525
"test:cov": "ut run test -- --coverage",
26+
"test:shard": "node scripts/run-shard.js",
2627
"benchmark:ci-test": "node scripts/ci-test-benchmark.js",
27-
"preci": "ut run pretest --workspaces --if-present",
28+
"preci": "ut run pretest",
2829
"ci": "ut run test -- --coverage",
30+
"ci:shard": "node scripts/run-shard.js --coverage",
2931
"dev:services:start": "node scripts/dev-services.js start",
3032
"dev:services:stop": "node scripts/dev-services.js stop",
3133
"dev:services:status": "node scripts/dev-services.js status",
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import { strict as assert } from 'node:assert';
2+
import { readFile } from 'node:fs/promises';
3+
import { scheduler } from 'node:timers/promises';
4+
5+
import { mm, type MockClusterApplication } from '@eggjs/mock';
6+
import { describe, it, beforeAll, afterAll } from 'vitest';
7+
8+
import { cluster, getFilepath } from './utils.ts';
9+
10+
// Checks the agent's custom logger via the apps/custom-logger cluster fixture. TODO: flaky on Windows (hook timed out in 20000ms), hence skipIf(win32).
11+
describe.skipIf(process.platform === 'win32')('test/agent_worker.test.ts > agent custom loggers', () => {
12+
let app: MockClusterApplication;
13+
14+
beforeAll(() => {
15+
app = cluster('apps/custom-logger');
16+
return app.ready();
17+
});
18+
afterAll(() => app.close());
19+
20+
// Restore mm after the suite to keep behavior parity with the original sibling describe block.
21+
afterAll(() => mm.restore());
22+
23+
it('should support custom logger in agent', async () => {
24+
await scheduler.wait(1500); // NOTE(review): presumably gives the agent time to write monitor.log before it is read — confirm
25+
const content = await readFile(getFilepath('apps/custom-logger/logs/monitor.log'), 'utf8');
26+
assert.match(content, /hello monitor!/);
27+
});
28+
});

0 commit comments

Comments
 (0)