-
Notifications
You must be signed in to change notification settings - Fork 265
383 lines (350 loc) · 19.6 KB
/
ci.yml
File metadata and controls
383 lines (350 loc) · 19.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
name: CI

on:
  pull_request:
  push:
    branches: [main]
  workflow_dispatch:
    inputs:
      clear_cache:
        # Re-runs of a failed job already force a cold build automatically
        # (see "Wipe restored caches" step). This input is for the rare case
        # where you want to start a brand-new run with a cold build — e.g.
        # after a CACHE_SALT bump on `main` to verify the cold path before
        # PR runs hit it.
        description: "Force a cold build on the FIRST attempt (re-runs are already cold)"
        type: boolean
        default: false

concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

env:
  # Bump to invalidate every cache entry without source surgery (e.g., after a
  # known-bad cache or an Xcode toolchain upgrade we want to flush manually).
  CACHE_SALT: v3-pr-cold-deriveddata
  # Pin Xcode so cache keys are stable across runner image bumps. When you
  # need to upgrade, change here AND in setup-xcode below.
  XCODE_VERSION: "26.4.1"

jobs:
  test-core:
    # Pinned (was `macos-latest`) so a runner image bump can't quietly
    # change the build environment under us.
    runs-on: macos-26
    # 45 min, not 30. Three-bucket budget:
    #   ~25–30 min — full cold build (mlx-swift `Cmlx` C++ + SQLCipher
    #                `sqlite3.c` ~250k LoC of C compiled with
    #                `-DSQLITE_HAS_CODEC=1` and friends + OsaurusCore +
    #                OsaurusCoreTests Swift) when both SPM and DerivedData
    #                caches miss. PR #951 (run 24937664669, attempt 2)
    #                hit the prior 30-min wall mid-Swift-compile after
    #                27:27 in the xcodebuild step — that's the empirical
    #                floor on `macos-26` with the SQLCipher amalgamation.
    #   ~ 2– 3 min — actual `xcodebuild test` once the build is warm.
    #   ~10–15 min — buffer / future growth, runner variance.
    #
    # Once any successful run lands on `main`, the `Save DerivedData cache`
    # step at the bottom populates the cache and subsequent runs return to
    # ~5 min total. The 45-min ceiling is an "even a worst-case cold build
    # finishes" guard, NOT an expected duration. If you find yourself
    # raising it again, the right fix is to split this into a separate
    # build-cache-warm job that runs nightly on `main`, not to bump the
    # ceiling indefinitely.
    timeout-minutes: 45
    env:
      WORKSPACE: osaurus.xcworkspace
      SPM_CACHE: .spm-cache
      XCRESULT_PATH: build/Tests.xcresult
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
      # `$SECONDS` resets in every `run:` step's fresh shell, so a later step
      # cannot use it to measure job duration. Record a wall-clock start once;
      # the `Annotate timing` step computes the delta from it.
      - name: Record job start
        run: echo "JOB_START_TS=$(date +%s)" >> "$GITHUB_ENV"
      - name: Set up Xcode ${{ env.XCODE_VERSION }}
        uses: maxim-lobanov/setup-xcode@v1
        with:
          xcode-version: ${{ env.XCODE_VERSION }}
      - name: Install xcbeautify
        run: brew install xcbeautify
      - name: Cache SPM packages
        uses: actions/cache@v5
        with:
          path: ${{ env.SPM_CACHE }}
          key: spm-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}-${{ hashFiles('osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved') }}
          restore-keys: |
            spm-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}-
            spm-${{ runner.os }}-${{ env.CACHE_SALT }}-
      - name: Restore DerivedData cache
        id: dd-cache
        # Restore only on main pushes / manual maintainer runs. Pull requests
        # intentionally cold-build DerivedData: restore-key hits have produced
        # stale Swift modules whose C-module dependencies are missing when
        # Xcode later compiles EventSource.
        if: ${{ github.event_name != 'pull_request' }}
        uses: actions/cache/restore@v5
        with:
          path: ~/Library/Developer/Xcode/DerivedData
          # Include vendored C sources (currently the SQLCipher amalgamation
          # under Packages/OsaurusCore/SQLCipher/). Without this, an
          # SQLCipher bump would land its new sqlite3.{c,h} but CI would
          # silently re-use a stale cached compile of the old code.
          key: dd-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}-${{ hashFiles('osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved', 'Packages/**/*.swift', 'Packages/**/Package.swift', 'Packages/**/Resources/**', 'Packages/**/*.c', 'Packages/**/*.h') }}
          restore-keys: |
            dd-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}-
      # Make "clear the build cache" a one-click operation. Three triggers:
      #   1. Pull requests — always cold-build DerivedData so PRs never trust
      #      a cached Xcode build product from another ref.
      #   2. `github.run_attempt != '1'` — i.e. a re-run. The default
      #      "Re-run failed jobs" button is the natural place for someone
      #      who just saw a build failure to land, so we make that the
      #      intuitive escape hatch for cache poison: the first attempt
      #      uses the cache (fast); any re-run forces a cold compile.
      #   3. `workflow_dispatch.clear_cache=true` — manual force-cold on
      #      a fresh run (e.g. validating a CACHE_SALT bump before PRs
      #      start hitting it).
      #
      # We wipe ONLY DerivedData, not the SPM cache. DerivedData holds
      # compiled object files / .swiftmodule / linked binaries — the
      # actual build outputs that can carry over a stale-source bug across
      # incremental builds. The SPM cache is just downloaded source code
      # pinned by `Package.resolved` checksums; it can't be "poisoned" in
      # any way that affects build correctness, and re-downloading it on
      # every re-run cost ~2 min in PR #951 run 24937664669 — wasted
      # budget that contributed to the 30-min cold-build cancellation.
      #
      # On main/manual runs we wipe AFTER the restore step (rather than
      # skipping the restore) so `steps.dd-cache.outputs.cache-primary-key`
      # stays populated and the `Save DerivedData cache` step at the bottom
      # can still repopulate the cache on a successful `main` run.
      - name: Wipe restored DerivedData (PR, re-run, or workflow_dispatch clear_cache)
        if: ${{ github.event_name == 'pull_request' || github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }}
        run: |
          REASON="event=${{ github.event_name }}, run_attempt=${{ github.run_attempt }}"
          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.clear_cache }}" = "true" ]; then
            REASON="$REASON, workflow_dispatch clear_cache=true"
          fi
          echo "::notice title=Cold build forced::Wiping DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved)."
          rm -rf "$HOME/Library/Developer/Xcode/DerivedData"
      - name: Resolve dependencies
        run: >-
          xcodebuild -resolvePackageDependencies
          -workspace "$WORKSPACE"
          -scheme OsaurusCoreTests
          -clonedSourcePackagesDirPath "$SPM_CACHE"
          -quiet
      - name: Test OsaurusCore
        id: test
        run: |
          set -o pipefail
          mkdir -p build
          # Remove any stale result bundle first: xcodebuild refuses to
          # overwrite an existing bundle at -resultBundlePath.
          rm -rf "$XCRESULT_PATH"
          # Per-test-case allowance (60s / 120s hard cap) surfaces a hung
          # test BEFORE the job wall-timeout — and crucially, surfaces it
          # WITH A NAME ATTACHED in the xcresult bundle, which the failure
          # summary step below relies on.
          #
          # Why no `--quiet` on xcbeautify and no `-test-iterations`:
          # both were band-aids that landed during PR #878's launch-hang
          # incident. `--quiet` strips per-test "Started/Passed" lines so a
          # hung test produces no output telling us WHICH test hung.
          # `-test-iterations 2 -retry-tests-on-failure` doubles the
          # wall-clock cost of every hang for zero diagnostic gain. Both
          # are off by default — re-add only if you have a per-PR reason
          # and an exit plan.
          xcodebuild test \
            -workspace "$WORKSPACE" \
            -scheme OsaurusCoreTests \
            -disableAutomaticPackageResolution \
            -clonedSourcePackagesDirPath "$SPM_CACHE" \
            -resultBundlePath "$XCRESULT_PATH" \
            -skipPackagePluginValidation \
            -skipMacroValidation \
            -enableCodeCoverage NO \
            -test-timeouts-enabled YES \
            -default-test-execution-time-allowance 60 \
            -maximum-test-execution-time-allowance 120 \
            COMPILER_INDEX_STORE_ENABLE=NO \
            SWIFT_COMPILATION_MODE=incremental \
            | xcbeautify --renderer github-actions --is-ci
      - name: Annotate timing
        if: always()
        # Wall-clock job duration from the `Record job start` step. (`$SECONDS`
        # would only measure this step's own shell, i.e. ~0.)
        run: echo "::notice title=test-core duration::$(( $(date +%s) - JOB_START_TS )) seconds"
      - name: Print failure summary
        # Also run on `cancelled()` so a job-timeout cancellation (e.g. a
        # cold build that ate the 45-min wall) still gets a Mode A diag
        # block in the GitHub UI instead of being a silent skip — see
        # PR #951 run 24937664669, where attempt 2's 27:27 cold compile
        # was killed by the prior 30-min timeout AND `Print failure
        # summary` was skipped because cancellation isn't `failure()`.
        if: ${{ failure() || cancelled() }}
        env:
          # Surface the cache outcome inside the summary so the next person
          # can immediately tell "cold-cache compile timeout" from "warm
          # cache + actual hang" without scrolling the log.
          DD_CACHE_HIT: ${{ steps.dd-cache.outputs.cache-hit }}
        run: |
          set +e
          echo "## test-core failures" >> "$GITHUB_STEP_SUMMARY"
          # Four real failure modes we want to disambiguate. Each conflation
          # has historically cost us hours of misdirected debugging, so this
          # step exists specifically to put the right diagnosis at the top
          # of the GitHub job summary instead of forcing the next person to
          # re-derive it from raw logs.
          #
          # Mode A — xctest binary doesn't exist on disk
          #          ⇒ build phase didn't finish (compile error, OR cold
          #            cache + 30-min wall-timeout fired mid-Swift-compile,
          #            OR linker failure). PR #881 (run 24573707695) hit this
          #            flavor: both SPM and DerivedData caches missed because
          #            main hadn't ever saved one, so the cold build ran out
          #            of clock during OsaurusCoreTests Swift compilation.
          #
          # Mode B — xctest binary built BUT no xcresult bundle
          #          ⇒ test bundle launched but produced zero output before
          #            the wall-timeout / cancellation. Real launch-time hang
          #            (dyld load, +load methods, or first-test module init).
          #            PR #878 (run 24563175247) hit this flavor — see the
          #            comment block atop `_registerFactoriesOnce` in
          #            ModelRuntime.swift.
          #
          # Mode C — xcresult exists with zero test cases
          #          ⇒ test execution was killed mid-flight. Look for the
          #            last `Test Case ... started` line in the raw log;
          #            that's the prime suspect for the hang. (`--quiet` is
          #            intentionally NOT passed to xcbeautify so the line is
          #            present.)
          #
          # Mode D — xcresult exists with failed test cases
          #          ⇒ ordinary test failures; render the failures table.
          # Resolve the xctest binary path. Xcode hashes the DerivedData
          # subdirectory off the workspace path, so glob rather than
          # hardcode. We only care about existence, not the actual binary.
          XCTEST_BINARY="$(find "$HOME/Library/Developer/Xcode/DerivedData" \
            -maxdepth 6 -name 'OsaurusCoreTests.xctest' -type d 2>/dev/null \
            | head -1)"
          if [ ! -d "$XCRESULT_PATH" ]; then
            if [ -z "$XCTEST_BINARY" ]; then
              # Mode A.
              CACHE_NOTE="_(DerivedData cache hit: \`${DD_CACHE_HIT:-unknown}\`, run attempt: \`${{ github.run_attempt }}\`)_"
              {
                echo "**Mode A — build phase did not complete (no xctest bundle on disk).**"
                echo
                echo "Either a compile/link error fired (scroll the **Test OsaurusCore** log above for the first \`error:\` line), OR the cold build ran past the 45-min job timeout. ${CACHE_NOTE}"
                echo
                echo "If \`cache-hit: false\` AND no \`error:\` lines appear in the raw log, this is the cold-cache-timeout flavor. The fix is to land one successful run on \`main\` so the \`Save DerivedData cache\` step at the bottom of this job populates the cache; subsequent PR runs warm-start from it and finish in ~5 min. Re-running this same job will hit the cache the second time only IF the first attempt finishes successfully."
                echo
                echo "**\`run_attempt > 1\` AND \`cache-hit: false\`?** That's the deliberate cold-rebuild path triggered by **Re-run failed jobs** — see the \`Wipe restored DerivedData\` step in this job. If the cold build is exhausting the 45-min budget on every re-run, the codebase has outgrown the budget; bump \`timeout-minutes\` and update its comment block, OR move warm-cache priming to a nightly \`main\` job so PRs always warm-start."
                echo
                echo "**Suspect cache poisoning on a fresh attempt?** Pull requests already cold-build DerivedData; main/manual re-runs wipe DerivedData automatically while preserving the pinned SPM source cache."
              } >> "$GITHUB_STEP_SUMMARY"
            else
              # Mode B.
              {
                echo "**Mode B — xctest bundle built but launched silently (real launch-time hang).**"
                echo
                echo "The \`OsaurusCoreTests.xctest\` binary exists in DerivedData (\`$XCTEST_BINARY\`), so the build phase finished. The hang is in dyld load, an ObjC \`+load\` method, a Swift module initializer, or the first test's module init."
                echo
                echo "Prime suspects: any file-level \`let\` in OsaurusCore that touches MLX/Metal at first reference (this is the regression class that PR #878 surfaced — see \`_registerFactoriesOnce\` in \`Packages/OsaurusCore/Services/ModelRuntime.swift\` and the matching tests in \`Packages/OsaurusCore/Tests/Service/ModelRuntimeFindDirectoryTests.swift\`)."
              } >> "$GITHUB_STEP_SUMMARY"
            fi
            exit 0
          fi
          # `xcresulttool get test-results tests` is the modern (Xcode 16+)
          # JSON view of the test outcomes. Older flag is now deprecated.
          XCRESULT_JSON="$(xcrun xcresulttool get test-results tests \
            --path "$XCRESULT_PATH" \
            --format json 2>/dev/null)"
          # Count test cases regardless of outcome so we can detect the
          # "started, killed mid-flight" case where there are no Failure
          # nodes but also no Passed nodes.
          TEST_CASE_COUNT="$(printf '%s' "$XCRESULT_JSON" \
            | jq '[ .. | objects | select(.nodeType? == "Test Case" or .nodeType? == "Test") ] | length' \
            2>/dev/null || echo 0)"
          if [ "${TEST_CASE_COUNT:-0}" = "0" ]; then
            echo "**Mode C — xcresult bundle exists but contains zero test cases.**" >> "$GITHUB_STEP_SUMMARY"
            echo "" >> "$GITHUB_STEP_SUMMARY"
            echo "Test execution was killed before any test recorded a result — most likely a hung test that ate the job wall-timeout, or a runner crash. Check the raw log for the last \`Test Case ... started\` line; that's the prime suspect for the hang." >> "$GITHUB_STEP_SUMMARY"
            exit 0
          fi
          echo "**Mode D — failed test cases:**" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          # Two jq fixes vs. the first version of this step:
          #   * the table header now ends in "\n" — previously the first
          #     failure row was glued onto the `| --- | --- |` separator row;
          #   * the "_(no message)_" fallback uses an explicit length check —
          #     `join` of an empty array yields "", which is truthy in jq, so
          #     `// "_(no message)_"` could never fire.
          printf '%s' "$XCRESULT_JSON" | jq -r '
            [ .. | objects | select(.result? == "Failed" and (.nodeType? == "Test Case" or .nodeType? == "Test")) ]
            | if length == 0 then
                "_No individual failed tests reported despite a non-zero exit. Likely a build error, post-test cleanup failure, or runner crash — see the raw log above._"
              else
                ( "| Test | Failure |\n| --- | --- |\n" )
                + ( map(
                      "| `" + (.name // "(unknown)") + "` | "
                      + ( [ .. | objects | select(.nodeType? == "Failure Message") | .name ]
                          | if length == 0 then "_(no message)_" else join("<br>") end )
                      + " |"
                    ) | join("\n") )
              end
          ' >> "$GITHUB_STEP_SUMMARY" || true
      - name: Upload xcresult on failure
        # Same `failure() || cancelled()` rationale as the failure-summary
        # step above: on a wall-timeout the xcresult bundle may be
        # partially populated and is still useful for postmortem.
        if: ${{ failure() || cancelled() }}
        uses: actions/upload-artifact@v5
        with:
          name: test-core-xcresult-${{ github.run_attempt }}
          path: ${{ env.XCRESULT_PATH }}
          retention-days: 7
          if-no-files-found: warn
      # Save the cache only on `main` so a half-baked PR can never poison it.
      # `actions/cache/save@v5` is a no-op when the key already exists, so a
      # forced cold re-run on `main` (which wipes DerivedData and rebuilds
      # from scratch) won't overwrite a known-good cache entry under the
      # same key. To intentionally invalidate every cache, bump CACHE_SALT.
      - name: Save DerivedData cache
        if: ${{ github.ref == 'refs/heads/main' && success() && steps.dd-cache.outputs.cache-primary-key != '' }}
        uses: actions/cache/save@v5
        with:
          path: ~/Library/Developer/Xcode/DerivedData
          key: ${{ steps.dd-cache.outputs.cache-primary-key }}

  test-cli:
    # Pinned (was `macos-latest`).
    runs-on: macos-26
    timeout-minutes: 10
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
      # `$SECONDS` resets per `run:` step; record wall-clock start for the
      # `Annotate timing` step below.
      - name: Record job start
        run: echo "JOB_START_TS=$(date +%s)" >> "$GITHUB_ENV"
      # OsaurusCLI's Package.swift requires `swift-tools-version: 6.2`, which
      # ships with the pinned Xcode. The runner's default `swift` may be older
      # (currently 6.1), so point xcrun at the pinned Xcode for these steps.
      # GitHub Actions doesn't let job-level `env:` reference workflow-level
      # `env:`, so keep XCODE_VERSION in sync here if it ever changes.
      - name: Verify Swift toolchain
        env:
          DEVELOPER_DIR: /Applications/Xcode_${{ env.XCODE_VERSION }}.app/Contents/Developer
        run: swift --version
      - name: Run CLI tests
        env:
          DEVELOPER_DIR: /Applications/Xcode_${{ env.XCODE_VERSION }}.app/Contents/Developer
        run: swift test --package-path Packages/OsaurusCLI --parallel
      - name: Annotate timing
        if: always()
        run: echo "::notice title=test-cli duration::$(( $(date +%s) - JOB_START_TS )) seconds"

  swiftlint:
    # Pinned (was `macos-latest`).
    runs-on: macos-26
    timeout-minutes: 10
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
      - name: Install SwiftLint
        run: brew install swiftlint
      - name: Run SwiftLint
        run: swiftlint lint --reporter github-actions-logging

  shellcheck:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
      - name: Install shellcheck
        run: sudo apt-get update && sudo apt-get install -y shellcheck
      - name: Lint shell scripts
        # `-r` (--no-run-if-empty): without it GNU xargs still invokes
        # `shellcheck` once with zero file arguments when `find` matches
        # nothing, and shellcheck errors out on an empty argument list.
        run: find scripts -name '*.sh' -print0 | xargs -0 -r shellcheck --severity=warning