eggjs
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 101 additions & 18 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 101 additions & 18 deletions
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 0 deletions b/‎.gitignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎AGENTS.md‎
Lines changed: 39 additions & 2 deletions b/‎AGENTS.md‎
Lines changed: 39 additions & 2 deletions
diff --git a/‎package.json‎
Lines changed: 6 additions & 4 deletions b/‎package.json‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎packages/cluster/test/agent_custom_logger.test.ts‎
Lines changed: 28 additions & 0 deletions b/‎packages/cluster/test/agent_custom_logger.test.ts‎
Lines changed: 28 additions & 0 deletions
@@ -18,11 +18,20 @@ permissions:
   actions: write
 
 jobs:
-  typecheck:
+  # Static-analysis / build checks. Each check runs as its own parallel job
+  # (instead of serial steps) so the slowest one — not their sum — bounds the
+  # wall time. Goal: keep each ≤ 60s.
+  quality:
+    strategy:
+      fail-fast: false
+      matrix:
+        check: ['lint', 'typecheck', 'fmtcheck', 'build', 'site']
+
+    name: Quality (${{ matrix.check }})
     runs-on: ubuntu-latest
 
     concurrency:
-      group: typecheck-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}
+      group: quality-${{ matrix.check }}-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}
       cancel-in-progress: true
     steps:
       - name: Checkout repository
@@ -41,32 +50,86 @@ jobs:
         run: ut install --from pnpm || (sleep 5 && ut install --from pnpm) || (sleep 10 && ut install --from pnpm)
 
       - name: Run lint
+        if: ${{ matrix.check == 'lint' }}
         run: ut run lint
 
       - name: Run typecheck
+        if: ${{ matrix.check == 'typecheck' }}
         run: ut run typecheck
 
       - name: Run format check
+        if: ${{ matrix.check == 'fmtcheck' }}
         run: ut run fmtcheck
 
       - name: Run build
+        if: ${{ matrix.check == 'build' }}
         run: ut run build
 
       - name: Run site build
+        if: ${{ matrix.check == 'site' }}
         run: ut run site:build
 
   test:
     strategy:
       fail-fast: false
       matrix:
-        os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
+        # ubuntu-latest runs the fine-grained shards: fork-heavy packages are
+        # split into vitest --shard slices and the light remainder into
+        # weight-balanced rest-* shards (see scripts/run-shard.js). Sized so each
+        # shard's `vitest run` step stays ≤ ~60s on ubuntu-latest, where runner
+        # concurrency is high enough to run them all in parallel.
+        os: ['ubuntu-latest']
         node: ['22', '24']
-
-    name: Test (${{ matrix.os }}, ${{ matrix.node }})
+        shard:
+          - cluster-1
+          - cluster-2
+          - cluster-3
+          - egg-1
+          - egg-2
+          - mock-1
+          - mock-2
+          - schedule-1
+          - schedule-2
+          - schedule-3
+          - schedule-4
+          - development
+          - security-1
+          - security-2
+          - redis
+          - multipart
+          - rest-1
+          - rest-2
+          - rest-3
+          - rest-4
+          - rest-5
+          - rest-6
+          - rest-7
+          - rest-8
+        include:
+          # macOS/Windows runners have very low concurrency (~5 macOS slots), so
+          # fanning out 22 shards × 2 nodes would queue ~9 batches and blow up
+          # wall time. Run the whole suite as a single job per (os, node) instead
+          # — coverage without flooding the queue. (shard=all => no filter.)
+          # These whole-suite jobs take ~13-25min and dominate the overall
+          # workflow wall; the ≤60s-per-shard target applies to the ubuntu fan-out.
+          - os: macos-latest
+            node: '22'
+            shard: all
+          - os: macos-latest
+            node: '24'
+            shard: all
+          - os: windows-latest
+            node: '22'
+            shard: all
+          - os: windows-latest
+            node: '24'
+            shard: all
+
+    name: Test (${{ matrix.os }}, ${{ matrix.node }}, ${{ matrix.shard }})
     runs-on: ${{ matrix.os }}
 
     concurrency:
-      group: test-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}-(${{ matrix.os }}, ${{ matrix.node }})
+      group: test-${{ github.workflow }}-#${{ github.event.pull_request.number || github.head_ref || github.ref }}-(${{ matrix.os }}, ${{ matrix.node }}, ${{ matrix.shard }})
       cancel-in-progress: true
 
     steps:
@@ -163,11 +226,29 @@ jobs:
       - name: Install dependencies
         run: ut install --from pnpm || (sleep 5 && ut install --from pnpm) || (sleep 10 && ut install --from pnpm)
 
-      - name: Run tests
-        run: ut run ci
+      - name: Prepare test fixtures (clean dist + db)
+        # Mirrors the original `preci` (pretest) step: clears stale dist so tegg
+        # plugin tests don't double-load src+dist, and runs per-workspace
+        # pretest (e.g. orm DB table init). The shard runner invokes vitest
+        # directly, bypassing the npm `preci` lifecycle, so do it explicitly.
+        run: ut run pretest
+
+      - name: Set HOME on Windows
+        # Newer windows-latest runner images leave %HOME% unset, while
+        # @eggjs/core's getHomedir() falls back to os.homedir()
+        # (C:\Users\runneradmin). egg_loader.test.ts asserts
+        # getHomedir() === process.env.HOME, so align HOME with USERPROFILE the
+        # way older runner images did. Unix runners already export HOME.
+        if: ${{ matrix.os == 'windows-latest' }}
+        shell: pwsh
+        run: echo "HOME=$env:USERPROFILE" >> $env:GITHUB_ENV
+
+      - name: Run tests (shard ${{ matrix.shard }})
+        run: node scripts/run-shard.js ${{ matrix.shard }} -- --coverage
 
       - name: Run example tests
-        if: ${{ matrix.os != 'windows-latest' }}
+        # Only run example tests once per (os, node) combo to avoid duplication.
+        if: ${{ matrix.os != 'windows-latest' && matrix.shard == 'rest-1' }}
         run: |
           ut run example:test:all
 
@@ -207,17 +288,19 @@ jobs:
       - name: Install dependencies
         run: ut install --from pnpm || (sleep 5 && ut install --from pnpm) || (sleep 10 && ut install --from pnpm)
 
+      - name: Set HOME on Windows
+        if: ${{ matrix.os == 'windows-latest' }}
+        shell: pwsh
+        run: echo "HOME=$env:USERPROFILE" >> $env:GITHUB_ENV
+
       - name: Run tests
+        # `ci` (vitest --coverage) adds ~50s of instrumentation on egg-bin's
+        # fork-heavy suite, pushing it past 60s. Run without coverage (`test`)
+        # to keep the job ≤60s; egg-bin's fork tests can't be sharded (they
+        # depend on cross-file shared state under isolate:false).
         run: |
           ut run build -- --workspace ./tools/egg-bin
-          ut run ci --workspace @eggjs/bin
-
-      - name: Code Coverage
-        # skip on windows, it will hangup on codecov https://github.com/codecov/codecov-action/issues/1787
-        if: ${{ matrix.os != 'windows-latest' }}
-        uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de # v5
-        with:
-          use_oidc: true
+          ut run test --workspace @eggjs/bin
 
   test-egg-scripts:
     strategy:
@@ -265,7 +348,7 @@ jobs:
       - test
       - test-egg-bin
       - test-egg-scripts
-      - typecheck
+      - quality
     steps:
       - run: exit 1
         if: ${{ always() && (contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled')) }}
@@ -120,3 +120,8 @@ ecosystem-ci/examples
 pnpm-lock.yaml
 .utoo.toml
 .claude/
+
+# benchmark output directories
+benchmark/ci-test/baseline*
+benchmark/ci-test/run*
+benchmark/ci-test/2*
@@ -28,18 +28,55 @@ Egg is maintained as a utoo monorepo.
 
 ### Local CI
 
-Run tests **without building first**. The CI workflow (`ut install → ut run ci`) never runs `build` before tests. If `dist/` directories exist from a prior build, tegg plugin tests will fail with `duplicate proto` errors because globby scans both `src/*.ts` and `dist/*.js`, loading the same decorated class twice.
+Run tests **without building first**. The CI test jobs run `ut run pretest`
+(clean dist + per-workspace pretest) then a vitest shard; they never `build`
+before tests. If `dist/` directories exist from a prior build, tegg plugin tests
+will fail with `duplicate proto` errors because globby scans both `src/*.ts` and
+`dist/*.js`, loading the same decorated class twice. `scripts/clean-dist.js`
+(run by `ut run clean-dist`) removes every `dist/` for you.
 
-When you see `duplicate proto` failures locally:
+When you see `duplicate proto` failures locally, run `ut run clean-dist` (or the
+equivalent find below) and re-run:
 
 ```bash
 find tegg packages plugins tools -name dist -type d \
   -not -path '*/node_modules/*' -not -path '*/test/*' -not -path '*/fixtures/*' \
   -exec rm -rf {} +
 ```
 
+### Aggregator scripts (avoid `ut run <x> --workspaces` recursion)
+
+The root `typecheck` / `pretest` aggregate per-workspace scripts via
+`node scripts/run-workspaces.js <script>`, **not** `ut run <script> --workspaces`.
+With utoo, `--workspaces` includes the monorepo root, so a root script that calls
+`ut run <same-name> --workspaces` recurses infinitely. `run-workspaces.js`
+enumerates real workspace dirs, runs each script body against the root-hoisted
+`node_modules/.bin` (so root-only CLIs like `tsgo` resolve even where a stale
+local bin shim exists), and runs them concurrently to bound wall time.
+
 Then re-run tests.
 
+### CI test sharding
+
+CI splits the test suite across parallel runners via `scripts/run-shard.js`
+(`node scripts/run-shard.js <shard>`). Each heavy fork-based package
+(`cluster`, `egg`, `mock`, `development`, `schedule-a`/`schedule-b`) runs on its
+own runner with a `--maxWorkers` cap so forked egg cluster child processes do
+not oversubscribe the CPU and time out; `rest-a`/`rest-b` split everything else
+with full parallelism. The goal is to keep each shard's `vitest run` wall time
+under ~60s. To reproduce one shard locally: `node scripts/run-shard.js cluster`.
+Use `node scripts/run-shard.js all` for the unsharded full suite.
+
+Cluster/mock/schedule tests fork real OS processes that bind ports. If a run is
+killed mid-flight, orphaned `start-cluster`/`app_worker`/`agent_worker`
+processes can linger and hold ports (e.g. 17001), causing later runs to fail
+with `EADDRINUSE`/`app.ready()` timeouts. Kill them before re-running:
+
+```bash
+ps aux | grep -E "[s]tart-cluster|[c]luster/src/(app|agent)_worker" \
+  | awk '{print $2}' | xargs kill -9
+```
+
 ## Coding Conventions
 
 - prefer existing repo patterns over inventing new ones
 
@@ -13,19 +13,21 @@
   ],
   "type": "module",
   "scripts": {
-    "clean-dist": "ut run clean --workspaces --if-present",
+    "clean-dist": "node scripts/clean-dist.js",
     "build": "tsdown",
     "prelint": "ut run clean-dist",
     "lint": "oxlint --type-aware --type-check --quiet",
     "fmt": "oxfmt",
-    "typecheck": "ut run clean-dist && ut run typecheck --workspaces --if-present",
+    "typecheck": "ut run clean-dist && node scripts/run-workspaces.js typecheck",
     "fmtcheck": "oxfmt --check .",
-    "pretest": "ut run clean-dist && ut run pretest --workspaces --if-present",
+    "pretest": "ut run clean-dist && node scripts/run-workspaces.js pretest",
     "test": "vitest run --bail 1 --retry 2 --testTimeout 20000 --hookTimeout 20000",
     "test:cov": "ut run test -- --coverage",
+    "test:shard": "node scripts/run-shard.js",
     "benchmark:ci-test": "node scripts/ci-test-benchmark.js",
-    "preci": "ut run pretest --workspaces --if-present",
+    "preci": "ut run pretest",
     "ci": "ut run test -- --coverage",
+    "ci:shard": "node scripts/run-shard.js --coverage",
     "dev:services:start": "node scripts/dev-services.js start",
     "dev:services:stop": "node scripts/dev-services.js stop",
     "dev:services:status": "node scripts/dev-services.js status",
 
@@ -0,0 +1,28 @@
+import { strict as assert } from 'node:assert';
+import { readFile } from 'node:fs/promises';
+import { scheduler } from 'node:timers/promises';
+
+import { mm, type MockClusterApplication } from '@eggjs/mock';
+import { describe, it, beforeAll, afterAll } from 'vitest';
+
+import { cluster, getFilepath } from './utils.ts';
+
+// TODO: flaky test on windows, Hook timed out in 20000ms
+describe.skipIf(process.platform === 'win32')('test/agent_worker.test.ts > agent custom loggers', () => {
+  let app: MockClusterApplication;
+
+  beforeAll(() => {
+    app = cluster('apps/custom-logger');
+    return app.ready();
+  });
+  afterAll(() => app.close());
+
+  // keep mm.restore behavior parity with original sibling describe
+  afterAll(() => mm.restore());
+
+  it('should support custom logger in agent', async () => {
+    await scheduler.wait(1500);
+    const content = await readFile(getFilepath('apps/custom-logger/logs/monitor.log'), 'utf8');
+    assert.match(content, /hello monitor!/);
+  });
+});