aver/.github/workflows/ci.yml at 53dcaddaee2ae765a96d776d8506178ca98ca663 · jasisz/aver · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
name: CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  # Nightly deep sweep against main with the heavy proptest budget: the
  # WASM job's cross-backend differential runs 2000 cases nightly but a
  # 64-case smoke per-PR (see its `PROPTEST_CASES` below). Mirrors the
  # fuzz / proof / rust-codegen nightlies (cron 03:00 UTC). Manual runs
  # via workflow_dispatch also get the full 2000-case sweep.
  schedule:
    - cron: '0 3 * * *'
  workflow_dispatch:

env:
  CARGO_TERM_COLOR: always

jobs:
  check:
    name: Check & Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy, rustfmt

      - uses: Swatinem/rust-cache@v2

      - name: Format
        run: cargo fmt --all -- --check

      - name: Clippy
        run: cargo clippy --workspace --all-targets --exclude aver-lang -- -D warnings && cargo clippy -p aver-lang --lib --bin aver -- -D warnings

      - name: Compile gate — terminal-off feature set
        # The default build has `terminal` on, so `#[cfg(not(feature =
        # "terminal"))]` branches (the playground/wasm32 and fuzz-mutator
        # stubs) never compile in the normal gates — a whole class of
        # regression that only surfaced in the nightly Fuzz job (which
        # builds aver-lang with `terminal` off). The Int=ℤ migration hit
        # exactly this: a `Value::Int(<raw i64>)` in the terminal-off
        # `Terminal.Size` stub broke 7/8 fuzz targets while every PR
        # stayed green. Checking the fuzz mutator manifest reproduces the
        # exact terminal-off feature resolution (aver-lang with
        # `default-features = false, features = ["runtime"]`) in ~17s, so
        # this class now gates on PRs instead of the morning after.
        run: cargo check --manifest-path fuzz/mutator/Cargo.toml

      - name: Test
        # Iron — Stage 4: CI exercises proptest sweeps at 2 000 cases
        # per property instead of the 256-case default. The matcher
        # invariants (`types::checker::tests`) and replay-codec
        # round-trip (`tests/replay_proptest.rs`) each surfaced real
        # bugs inside 256 cases; bumping the budget here lets CI sweep
        # ~8× deeper input space than a developer's local cycle does,
        # surfacing shapes that only fire at low probability. Local
        # `cargo test` keeps the 256 default for fast iteration. 2 000
        # is conservative — 10 000 was the first instinct but risked
        # tipping the Check & Test runtime past the rest of the gates;
        # the per-test runtime gap of 256 vs 2 000 is in the seconds,
        # 2 000 vs 10 000 is in the minutes for the heavier codec
        # roundtrip tests. Bump again if a class of bug slips at 2 000.
        env:
          PROPTEST_CASES: '2000'
        run: cargo test --workspace

      - name: Check shipped examples
        # Gate the shipped example corpus: a merge that breaks an
        # example under `aver check` must fail CI, not ship silently.
        # `cargo test --workspace` above already compiled the debug
        # `aver` binary (the CLI integration tests run it via
        # CARGO_BIN_EXE_aver), so `cargo run` here is a cache hit, not
        # a rebuild. Only `examples/core` and `examples/data` are
        # gated — `examples/diagnostics` contains deliberately broken
        # files (lint/error demos) and stays ungated. Bare exit codes:
        # each command's status fails the step directly, no pipes.
        run: |
          cargo run --bin aver -- check examples/core
          cargo run --bin aver -- check examples/data

  wasm:
    name: WASM
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - uses: dtolnay/rust-toolchain@stable
        with:
          components: clippy

      - uses: Swatinem/rust-cache@v2

      - name: Install WASM tools
        env:
          BINARYEN_VERSION: version_129
        run: |
          curl -fsSL "https://github.com/WebAssembly/binaryen/releases/download/${BINARYEN_VERSION}/binaryen-${BINARYEN_VERSION}-x86_64-linux.tar.gz" -o binaryen.tar.gz
          tar -xzf binaryen.tar.gz
          echo "$PWD/binaryen-${BINARYEN_VERSION}/bin" >> "$GITHUB_PATH"
          test -x "$PWD/binaryen-${BINARYEN_VERSION}/bin/wasm-merge"
          test -x "$PWD/binaryen-${BINARYEN_VERSION}/bin/wasm-opt"
          test -x "$PWD/binaryen-${BINARYEN_VERSION}/bin/wasm-metadce"

      - name: Clippy (wasm + wasip2)
        run: cargo clippy -p aver-lang --lib --bin aver --features wasm,wasip2 -- -D warnings

      - name: Test (wasm + wasip2)
        # `wasip2` is a separate feature; `wasm` alone leaves every
        # `#![cfg(feature = "wasip2")]` test file at 0 tests, so the
        # wasip2 codegen / component-model surface went unchecked by the
        # gate. Enable both (they share `wasm-compile`) so the wasip2_*
        # suites run.
        #
        # Proptest budget — split per-PR vs nightly. The wasm-feature
        # proptests include the cross-backend DIFFERENTIAL sweeps
        # (`tests/cross_backend_proptest.rs`), which spawn `aver run` twice
        # per case (VM + wasm-gc) and JIT each module in wasmtime —
        # ~0.5-1.5s/case, vs microseconds for the in-process proptests in
        # the Check & Test job. At 2000 cases that is ~30 min, too slow for
        # every PR. So: a 64-case SMOKE per-PR, and the full 2000-case
        # sweep NIGHTLY (and on workflow_dispatch). The deterministic
        # differentials (sign matrices, the Euclidean law, canonical-eq,
        # interpolation render) run at ANY case count, so they stay the
        # real per-PR net; the 2000-case sweep is the broad randomized
        # net, alongside the fuzz nightly's `fuzz_parity_vm_vs_wasm_gc`
        # in-process differential.
        env:
          PROPTEST_CASES: ${{ (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && '2000' || '64' }}
        run: cargo test -p aver-lang --features wasm,wasip2

  wasm_release_build:
    # Post-merge release-build verification under `--features wasm`.
    # PR-time CI runs `cargo test --features wasm` (debug profile) in
    # the `wasm` job above — that catches every logic regression. The
    # release build only matters when the artifact actually ships
    # (post-merge to `main`), and dragging it onto PR-time CI added
    # 3–5 min of wait without ever surfacing a release-only bug in
    # practice. If a release-mode regression slips in, a hot follow-up
    # PR fixes it from main; PR review unblocks immediately.
    name: WASM release build (main)
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    needs: [check, wasm]
    steps:
      - uses: actions/checkout@v5
      - uses: dtolnay/rust-toolchain@stable
      - uses: Swatinem/rust-cache@v2
      - name: Build (wasm release)
        run: cargo build -p aver-lang --features wasm --release

  bench:
    # Post-merge bench gate. PR-time CI doesn't run this — perf
    # regressions get caught when they hit `main`, then a follow-up
    # PR reverts or fixes them. Trade chosen deliberately: PR review
    # unblocks 3–5 min faster, and the fix-forward latency for a perf
    # regression on a pre-1.0 compiler is acceptable. Locally,
    # contributors can still run `aver bench bench/scenarios/
    # --target=vm` against their branch before opening a PR if they
    # touched a perf-sensitive path.
    name: Bench Gate
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - uses: dtolnay/rust-toolchain@stable

      - uses: Swatinem/rust-cache@v2

      - name: Build aver (release)
        run: cargo build --release --bin aver

      - name: Run bench scenarios
        # `--baseline-dir bench/baselines/` auto-picks the file matching
        # this runner's host (`<os>-<arch>-vm.json`). When no matching
        # baseline exists, the bench run still produces reports but the
        # gate is silently skipped — first run on a new host won't fail
        # the build, just won't gate. To pin a Linux baseline, run
        # `aver bench bench/scenarios/ --target=vm --json --save-baseline
        # bench/baselines/linux-x86_64-vm.json` on this runner and
        # commit the result.
        run: |
          ./target/release/aver bench bench/scenarios/ \
            --target=vm \
            --baseline-dir bench/baselines/ \
            --fail-on-regression \
            --json | tee bench-results.ndjson

      - name: Upload bench results
        if: always()
        uses: actions/upload-artifact@v6
        with:
          name: bench-results
          path: bench-results.ndjson
          retention-days: 30