Diag: Surface Windows crash exception codes via SEH filter #203
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Continuous-integration workflow: runs on pushes to `main` and on every
# pull request.
name: ci
on:
  push:
    branches: [main]
  pull_request:
# Least-privilege GITHUB_TOKEN. The steps visible in this workflow only
# check out the source and upload artifacts, neither of which needs write
# scope; widen per job if a task ever has to call the GitHub API.
permissions:
  contents: read
jobs:
build:
  name: build - ${{ matrix.os }}
  strategy:
    fail-fast: false
    matrix:
      # Pinned runner labels so a future GHA image bump doesn't
      # silently shift the matrix. `ubuntu-24.04-arm` is the GHA
      # ARM64 Linux tier (added 2024); together with `ubuntu-24.04`
      # we cover both Linux architectures natively. `macos-14` is
      # Apple Silicon. `windows-2022` is x86_64 Windows.
      os: [ubuntu-24.04, ubuntu-24.04-arm, macos-14, windows-2022]
  runs-on: ${{ matrix.os }}
  defaults:
    run:
      shell: bash
  env:
    CC: ${{ matrix.os == 'windows-2022' && 'gcc' || 'cc' }}
  # Captured logs live under the runner temp directory ($RUNNER_TEMP in
  # scripts, `runner.temp` in expressions) rather than a literal /tmp.
  # On the Windows runner, bash's /tmp is a Git Bash mount, while the
  # upload-artifact action resolves a leading-slash path against the
  # drive root -- so a /tmp path written by a script is not the path the
  # action reads. The runner temp directory is the same location for both.
  steps:
    - uses: actions/checkout@v4
    - name: Compiler version
      run: |
        $CC --version
        # On ubuntu-24.04 the alt-versions are also available; print
        # them so a future regression that's gcc-13-specific (or
        # 12-specific) is easier to triage.
        if [ "${{ matrix.os }}" = "ubuntu-24.04" ] || [ "${{ matrix.os }}" = "ubuntu-24.04-arm" ]; then
          for v in 12 13 14; do
            if command -v gcc-$v >/dev/null; then
              gcc-$v --version | head -1
            fi
          done
        fi
    - name: Bootstrap mino
      # Generates the bundled-source headers and compiles ./mino in
      # one step. Anything beyond bootstrap belongs in `./mino task`.
      # Tee stderr so a build failure leaves a captured log for the
      # post-step summary below; the live step output stays
      # unchanged for anyone with log access.
      run: |
        set -o pipefail
        make 2>&1 | tee "$RUNNER_TEMP/build.log"
    - name: Surface build failure
      # When the build step above fails, post the captured tail on
      # the job summary page so anyone with Actions UI access (incl.
      # signed-in external contributors) sees what broke without
      # having to download artifacts.
      if: failure()
      run: |
        {
          echo "## Build failure (${{ matrix.os }})"
          echo ''
          echo '```'
          tail -60 "$RUNNER_TEMP/build.log" 2>/dev/null || echo '(no build log captured)'
          echo '```'
        } >> "$GITHUB_STEP_SUMMARY"
    - name: Upload build log on failure
      # Artifacts are downloadable from the Actions page anonymously
      # for public repositories, so this is the path for off-repo
      # observers (and for the project bot) to see the actual gcc
      # error output without needing log-download permission.
      if: failure()
      uses: actions/upload-artifact@v4
      with:
        name: build-log-${{ matrix.os }}
        path: ${{ runner.temp }}/build.log
        retention-days: 7
    - name: Test
      # The id lets the trace-upload step below inspect this step's
      # raw outcome, which stays 'failure' even when continue-on-error
      # converts the conclusion to success (the Windows case).
      id: test
      # Run the suite runner directly so stdout streams. `task test`
      # wraps the subprocess in sh!, which buffers output until exit;
      # under a hang, no diagnostic ever surfaces.
      #
      # MINO_TEST_TRACE=1 prints one stderr line per deftest before
      # it runs; combined with capturing stderr into a file and
      # tee'ing it to the job log, the last visible trace line
      # pinpoints any hanging test. The stderr capture is what makes
      # an opaque "Test timed out" actionable. Keeps trace off
      # locally (env-gated) so a normal `./mino tests/run.clj`
      # produces the same output as before.
      #
      # Watchdog wrapper: GHA's timeout-minutes sends a SIGKILL after
      # the cap, which gives no diagnostic at all on a hang. We
      # spawn mino in the background, sleep just inside the cap,
      # then SIGABRT it -- mino's crash_handler (main.c:711) prints
      # a backtrace + gc stats on SIGABRT, so a hang now leaves a
      # readable stack in the log instead of a silent kill. mino
      # exits non-zero after the dump, which fails the step
      # normally (no continue-on-error masking).
      env:
        MINO_TEST_TRACE: "1"
      run: |
        set +e
        TRACE="$RUNNER_TEMP/test_trace.log"
        # Pre-create + tail the trace file so its lines stream to
        # the live job log as mino emits them. Without the tail,
        # the trace only appears via the failure artifact, which
        # makes a live `gh run watch` opaque.
        : > "$TRACE"
        (tail -F "$TRACE" 2>/dev/null) &
        TAIL_PID=$!
        # exec replaces the subshell with mino so $! is mino's
        # pid directly -- the watchdog's kill -ABRT then lands
        # on mino, not on an outer shell wrapper.
        (exec ./mino tests/run.clj) 2> "$TRACE" &
        MINO_PID=$!
        # Wake at 7m30s (the cap is 8m) so SIGABRT has time to
        # run mino's handler before GHA's own SIGKILL lands.
        # mino's crash_handler (main.c:711) prints a backtrace +
        # GC stats on SIGABRT, so a hang now leaves a readable
        # stack in the log + trace artifact instead of a silent
        # kill. mino exits 134 (128 + SIGABRT) after the dump.
        (sleep 450; if kill -0 $MINO_PID 2>/dev/null; then
          echo "##[warning]Watchdog firing SIGABRT on hung mino (pid $MINO_PID)"
          kill -ABRT $MINO_PID
        fi) &
        WD_PID=$!
        wait $MINO_PID
        RC=$?
        # Give the trace tail a moment to flush mino's last lines.
        sleep 1
        kill $WD_PID 2>/dev/null || true
        kill $TAIL_PID 2>/dev/null || true
        exit $RC
      # Tests usually finish in seconds; a hang means a deadlock, not
      # a slow runner. Cap so we get diagnostic output instead of
      # waiting on the 6h job-default timeout. The watchdog above
      # fires 30s before this cap so we keep the stack trace.
      timeout-minutes: 8
      # The Windows test suite has documented divergence: cmd.exe's
      # echo emits a trailing space before \n, which the proc-test
      # assertions do not strip. Build still must pass; tests are
      # informational on Windows until the suite is portable.
      continue-on-error: ${{ matrix.os == 'windows-2022' }}
    - name: Upload test trace on failure
      # The trace captures each deftest entry; the last line shows
      # what was running when the timeout fired. Artifacts are
      # public on the run page so triage doesn't require log
      # download permission.
      #
      # failure() alone misses Windows: the Test step is
      # continue-on-error there, so a failed suite leaves the job
      # status green. Checking the step's outcome covers that case;
      # failure() still covers an earlier failed step, where the
      # trace file may not exist (hence if-no-files-found: ignore).
      if: failure() || steps.test.outcome == 'failure'
      uses: actions/upload-artifact@v4
      with:
        name: test-trace-${{ matrix.os }}
        path: ${{ runner.temp }}/test_trace.log
        retention-days: 7
        if-no-files-found: ignore
    - name: Release gate
      # The composite gate -- check-reloc-mirror, check-stencil-
      # registry, the test suite under ASan, and 4-way JIT parity
      # (auto / on / off / lean) -- runs on every non-Windows
      # matrix entry. Windows skips the gate because the gate's
      # ASan step needs a libsanitizer that mingw doesn't ship.
      #
      # `check-stencils-fresh` is intentionally NOT in the gate:
      # it regenerates stencils with the host `cc`, which means
      # the committed bytes have to byte-match whatever toolchain
      # the runner ships. Dev (Apple clang 17), macos-14 (Apple
      # clang 15), and ubuntu-24.04 (gcc, no musttail support)
      # diverge enough that the byte check is structurally
      # incompatible with the matrix. Stale-commit hygiene is a
      # dev pre-commit step; CI correctness is gated by the test
      # suite + ASan + 4-way parity, which catch the actual
      # runtime impact of a stale stencil regardless of compiler.
      #
      # Watchdog wrapper (mirrors the Test step). The gate spawns
      # several mino subprocesses -- the ASan suite and the 4-way
      # parity binaries -- so a stall here otherwise runs to GHA's
      # diagnostic-free SIGKILL with no stack. The arm64 runner has
      # stalled in this step intermittently. We background the gate,
      # sleep to just under the cap, then SIGABRT every live mino;
      # mino's crash_handler (main.c) dumps a backtrace + GC stats per
      # process on SIGABRT, so the next stall leaves a readable stack
      # in the log instead of a silent wait. pkill matches mino,
      # mino_asan, and the parity binaries by name, so whichever
      # process is hung dumps. SIGABRT makes mino exit 134, failing
      # the step normally.
      if: matrix.os != 'windows-2022'
      run: |
        set +e
        ./mino task release-gate &
        GATE_PID=$!
        (sleep 660; if kill -0 $GATE_PID 2>/dev/null; then
          echo "##[warning]Watchdog: release-gate exceeded 11m; SIGABRT mino for a backtrace"
          pkill -ABRT mino 2>/dev/null
        fi) &
        WD_PID=$!
        wait $GATE_PID
        RC=$?
        sleep 2
        kill $WD_PID 2>/dev/null || true
        exit $RC
      timeout-minutes: 12
# Stencil determinism (re-armed).
#
# History: an earlier `cross-compile` job rebuilt every target's stencil
# header on macos-14 and diffed the result against the committed bytes.
# That job was dropped once the dev and CI compilers drifted apart
# (Apple clang 17 locally, Apple clang 15 on macos-14, gcc on Linux),
# because byte-identity across hosts was no longer achievable.
#
# This job is its replacement, built so the check is sound: a single
# runner with a single pinned `zig cc` -- the toolchain `gen-stencils-all`
# uses locally -- cross-compiles all targets, which makes the emitted
# bytes reproducible by construction. Keep it single-host: moving a
# byte-identity check back onto the per-OS matrix is precisely what broke
# the old job. The native matrix (gcc + Apple clang + mingw) stays the
# portability canary; this job only guards the freshness of the committed
# bytes against the pinned toolchain.
stencil-determinism:
  name: reproducibility - stencils
  runs-on: ubuntu-24.04
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v4
    - name: Install pinned Zig
      # Keep this version equal to `zig-version-pin` in
      # lib/mino/tasks/builtin.clj. Zig is pre-1.0, so even a minor
      # bump may change the bundled LLVM and with it the emitted
      # stencil bytes: bump both places together and regenerate.
      uses: mlugg/setup-zig@v2
      with:
        version: '0.16.0'
    - name: Bootstrap mino
      run: make
    - name: Check stencil freshness (all targets)
      # Rebuilds all five committed headers with the pinned zig cc and
      # fails when `git diff` reports any change -- meaning a stencil
      # source was edited (or the pin was bumped) without regenerating.
      timeout-minutes: 10
      run: ./mino task check-stencils-fresh-all
sanitize-zig:
  name: safety - sanitizers (UBSan+TSan)
  runs-on: ubuntu-24.04
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v4
    - name: Install pinned Zig
      # Keep in sync with zig-version-pin in lib/mino/tasks/builtin.clj.
      uses: mlugg/setup-zig@v2
      with:
        version: '0.16.0'
    - name: Bootstrap mino
      run: make
    - name: Reproducible UBSan + TSan suite (pinned compiler-rt)
      # mino is built with the version-locked zig cc under UBSan and
      # under TSan, JIT-enabled for this host (jit-enable-flags), and
      # the full suite runs under each build in AUTO and eager JIT
      # mode. This is additive to the host ASan build in release-gate.
      # ASan is not covered here because zig ships no ASan runtime;
      # that coverage stays on the host toolchain.
      timeout-minutes: 25
      run: ./mino task sanitize-zig
lint-zig:
  name: safety - strict warnings
  runs-on: ubuntu-24.04
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v4
    - name: Install pinned Zig
      # Keep in sync with zig-version-pin in lib/mino/tasks/builtin.clj.
      uses: mlugg/setup-zig@v2
      with:
        version: '0.16.0'
    - name: Bootstrap mino
      run: make
    - name: Curated strict-warning lint (third compiler lens)
      # Compile-only pass: every mino-authored TU is built with -Werror
      # over a curated strict warning set, using zig's newer clang, to
      # catch issues the gcc / Apple-clang matrix does not report.
      timeout-minutes: 10
      run: ./mino task lint-zig
build-zig-hermetic:
  name: reproducibility - hermetic build
  runs-on: ubuntu-24.04
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v4
    - name: Install pinned Zig
      # Keep in sync with zig-version-pin in lib/mino/tasks/builtin.clj.
      uses: mlugg/setup-zig@v2
      with:
        version: '0.16.0'
    - name: Bootstrap mino
      run: make
    - name: Hermetic build + full suite (pinned zig cc, static musl)
      # The compiler, libc, and linker all come from the version-locked
      # zig cc, and the full suite runs against the resulting binary,
      # so a green result does not hinge on the gcc/clang version the
      # runner image happens to ship. Additive only: the
      # gcc/Apple-clang/mingw matrix remains the canary.
      timeout-minutes: 15
      run: ./mino task test-zig
binary-reproducible:
  name: reproducibility - byte-identical binary
  runs-on: ubuntu-24.04
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v4
    - name: Install pinned Zig
      # Keep in sync with zig-version-pin in lib/mino/tasks/builtin.clj.
      uses: mlugg/setup-zig@v2
      with:
        version: '0.16.0'
    - name: Bootstrap mino
      run: make
    - name: Reproducible-artifact gate (pinned zig cc)
      # The published linux-amd64-musl artifact is built twice; the gate
      # asserts both builds are byte-identical and that neither embeds a
      # builder-specific absolute path ($PWD / $HOME). That property is
      # what allows a third party to rebuild the exact published bytes
      # from the pinned toolchain + source. As with stencil-determinism,
      # the single host is deliberate: the check asserts a property of
      # one build environment and never compares across images.
      timeout-minutes: 15
      run: ./mino task check-binary-reproducible
darwin-zig-canary:
  name: canary - darwin (zig cc)
  runs-on: macos-14
  # Informational while the zig-built darwin binary is evaluated: the
  # published darwin artifact stays Apple-clang native (release-build.yml)
  # until this proves out across releases. A failure surfaces as a signal,
  # not a merge block.
  continue-on-error: true
  # Bound the run: a local build + suite is ~2 min, so a hang (e.g. a
  # zig-built JIT deadlock) should trip this, not burn the 6-hour default.
  timeout-minutes: 20
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v4
    - name: Install pinned Zig
      # Must match zig-version-pin in lib/mino/tasks/builtin.clj.
      uses: mlugg/setup-zig@v2
      with:
        version: 0.16.0
    - name: Build darwin standalone with native zig cc
      # macos-14 is arm64 with the system SDK present, so native zig cc
      # compiles + links the full JIT-enabled Mach-O -- unlike a Linux
      # host, where Zig bundles no macOS SDK and darwin cannot be cross-
      # built. This evaluates zig cc as the uniform standalone compiler
      # on every platform. The crash handler's _Unwind_* symbols come
      # from libSystem here (no -lunwind needed, as on glibc).
      #
      # CFLAGS mirrors the Makefile default plus -Wno-unused-but-set-
      # variable: zig ships a newer clang than Apple clang and flags a
      # set-but-unused `carry` in vendored src/vendor/imath/imath.c that
      # Apple clang does not. Same toolchain-specific suppression the
      # cross-build (cross-cflags) already carries for this file.
      run: |
        make CC="zig cc" CFLAGS="-std=c99 -Wall -Wpedantic -Wextra -Werror \
          -Wno-missing-field-initializers -Wno-unknown-warning-option \
          -Wno-clobbered -Wno-unused-but-set-variable -O2 -DMINO_CPJIT=1"
    - name: Smoke + full suite (JIT auto-enabled on arm64-darwin)
      # arm64-darwin is the one host that runtime-exercises the CPJIT, so
      # the suite under the zig-built binary validates a JIT-enabled
      # distributable Mach-O -- the gate the mandate requires before this
      # could become the published darwin build.
      run: |
        ./mino --version
        out=$(./mino -e '(+ 1 2)')
        [ "$out" = "3" ] || { echo "darwin-zig smoke failed: got '$out'" >&2; exit 1; }
        ./mino tests/run.clj
    - name: Full suite with JIT forced on
      run: ./mino --jit=on tests/run.clj
    - name: Verify Mach-O TLV binding (zig linker tripwire)
      # The zig 0.16.0 self-hosted Mach-O linker mis-binds the 2nd+
      # __thread_vars descriptor thunks to a wrong import (observed
      # ___clear_cache / _printf instead of __tlv_bootstrap). It is
      # HARMLESS on macOS because dyld4 rewrites every descriptor in
      # the __thread_vars section to _tlv_get_addr at load,
      # regardless of the symbol the linker bound -- which is exactly
      # why the full suite above (mino leans on __thread for
      # mino_tls_ctx / cancel_ptr / safepoint_count) passes. This
      # step is the static tripwire: it asserts the TLV section
      # exists with the bootstrap symbol present, and flags any
      # mis-bind target OUTSIDE the known-benign set so a future zig
      # bump that changes the pattern surfaces here before the
      # darwin artifact is ever flipped to a zig build. See
      # docs/MAINTAINER_TOOLCHAIN.md (darwin TLV note) + the upstream
      # issue draft in .local/.
      run: |
        binds=$(dyld_info -fixups ./mino 2>/dev/null \
          | awk '/__thread_vars/ {print $NF}')
        echo "__thread_vars binds:"
        echo "$binds" | sed 's/^/ /'
        if [ -z "$binds" ]; then
          echo "TLV tripwire: no __thread_vars descriptors found" >&2
          exit 1
        fi
        if ! echo "$binds" | grep -q '__tlv_bootstrap'; then
          echo "TLV tripwire: __tlv_bootstrap absent -- TLV machinery missing" >&2
          exit 1
        fi
        # Known-benign bind targets: the correct bootstrap plus the
        # symbols the zig 0.16.0 mis-bind has been observed to pick
        # (all in the __thread_vars section dyld rewrites wholesale).
        #
        # The allowlist matches a WHOLE symbol: each name must sit at
        # the start of the field or directly after a non-identifier
        # character (such as a library/symbol separator). With only a
        # trailing `$`, any symbol that merely ends in an allowed name
        # (e.g. `_malloc_printf` ending in `_printf`) would be accepted
        # as benign and the tripwire would stay silent.
        unexpected=$(echo "$binds" \
          | grep -vE '(^|[^[:alnum:]_])(__tlv_bootstrap|___clear_cache|_printf)$' || true)
        if [ -n "$unexpected" ]; then
          echo "TLV tripwire: unexpected __thread_vars bind target(s):" >&2
          echo "$unexpected" | sed 's/^/ /' >&2
          echo "A zig bump changed the mis-bind pattern; re-verify TLV" >&2
          echo "correctness and update the allowlist before trusting it." >&2
          exit 1
        fi
        echo "TLV tripwire: OK (bootstrap present; mis-binds within known-benign set)"
# Per-host JIT runtime canaries.
#
# Of the five committed JIT pipelines, four (ELF arm64/x86_64, COFF
# x86_64, Mach-O x86_64) are gated behind opt-in defines that no
# published artifact sets, so these lanes are the only place they are
# exercised at runtime. Every runner builds the pipeline its hardware
# can execute -- macos-14 cross-builds Mach-O x86_64 and runs it under
# Rosetta 2 -- together with a lean twin, then runs the full suite in
# AUTO and eager mode plus the four-way parity check. All of that
# happens inside `./mino task test-jit-host`. The lanes are
# informational while they accumulate a green streak; promoting them to
# blocking is the gate for ever enabling the JIT in the published
# artifacts for these hosts.
jit-host-canary:
  name: canary - jit-host (${{ matrix.os }})
  runs-on: ${{ matrix.os }}
  strategy:
    fail-fast: false
    matrix:
      os:
        - ubuntu-24.04
        - ubuntu-24.04-arm
        - macos-14
        - windows-2022
  continue-on-error: true
  # Locally a run takes ~25s on arm64 darwin. CI runners are slower, and
  # windows slower again, but a hang -- the failure mode JIT bugs
  # produce -- should hit this cap rather than the 6-hour default.
  timeout-minutes: 25
  defaults:
    run:
      shell: bash
  env:
    CC: ${{ matrix.os == 'windows-2022' && 'gcc' || 'cc' }}
    # The suite here runs under a cross-compiled or emulated binary
    # (Rosetta x86_64-on-arm64 on macos-14, a zig-gnu build on windows)
    # on shared runners, where the JIT loop-cancel handshake is several
    # times slower than native. This flag only widens the
    # cancellation-latency budget in jit_invalidation_test; the strict
    # bound is still enforced by the blocking native matrix, which runs
    # the same suite without the flag.
    MINO_SLOW_HOST: '1'
  steps:
    - uses: actions/checkout@v4
    - name: Install pinned Zig
      # Keep in sync with zig-version-pin in lib/mino/tasks/builtin.clj.
      uses: mlugg/setup-zig@v2
      with:
        version: '0.16.0'
    - name: Bootstrap mino
      run: make
    - name: Per-host JIT canary (suite auto + eager, 4-way parity)
      run: ./mino task test-jit-host
# Portability canary: compile the amalgamated single-file source with
# MSVC's C frontend -- the one toolchain neither gcc (Linux/Windows
# matrix) nor clang (macOS matrix, zig) represents. continue-on-error
# so an MSVC-specific C99 gap surfaces as an informational signal, not
# a merge block, until the amalgam is vetted MSVC-clean. This ADDS a
# compiler to CI without removing any -- the gcc/clang/mingw matrix
# stays the gating portability signal.
msvc-compile-canary:
  name: canary - msvc (cl.exe)
  runs-on: windows-latest
  continue-on-error: true
  # Bound the run like the other canary jobs: this job only bootstraps,
  # amalgamates, and does one compile-only pass, so a stalled step should
  # trip this cap instead of running to the 6-hour job default.
  timeout-minutes: 20
  env:
    CC: gcc
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v4
    - name: Bootstrap mino (mingw gcc)
      run: make
    - name: Produce amalgamation
      run: ./mino task amalgamate
    - name: Set up MSVC environment
      uses: ilammy/msvc-dev-cmd@v1
    - name: Compile amalgam with cl.exe /TC (compile-only)
      # /TC forces C (not C++); /c is compile-only -- we're checking
      # the C frontend accepts the source, not producing a binary.
      # Runs under cmd (overriding the job's bash default), hence the
      # backslash path.
      shell: cmd
      run: cl /TC /c /I dist dist\mino.c