36 changes: 20 additions & 16 deletions .github/workflows/ci.yml
@@ -26,7 +26,7 @@ permissions:
env:
# Bump to invalidate every cache entry without source surgery (e.g., after a
# known-bad cache or an Xcode toolchain upgrade we want to flush manually).
CACHE_SALT: v2-vmlx-5b84387
CACHE_SALT: v3-pr-cold-deriveddata
# Pin Xcode so cache keys are stable across runner image bumps. When you
# need to upgrade, change here AND in setup-xcode below.
XCODE_VERSION: "26.4.1"
@@ -83,9 +83,11 @@ jobs:

- name: Restore DerivedData cache
id: dd-cache
# Always restore so `cache-primary-key` is populated for the save
# step at the bottom (the wipe step below handles forced cold
# builds without preventing main from repopulating the cache).
# Restore only on main pushes / manual maintainer runs. Pull requests
# intentionally cold-build DerivedData: restore-key hits have produced
# stale Swift modules whose C-module dependencies are missing when
# Xcode later compiles EventSource.
if: ${{ github.event_name != 'pull_request' }}
uses: actions/cache/restore@v5
with:
path: ~/Library/Developer/Xcode/DerivedData
@@ -97,13 +99,15 @@
restore-keys: |
dd-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}-

# Make "clear the build cache" a one-click operation. Two triggers:
# 1. `github.run_attempt != '1'` — i.e. a re-run. The default
# Make "clear the build cache" a one-click operation. Three triggers:
# 1. Pull requests — always cold-build DerivedData so PRs never trust
# a cached Xcode build product from another ref.
# 2. `github.run_attempt != '1'` — i.e. a re-run. The default
# "Re-run failed jobs" button is the natural place for someone
# who just saw a build failure to land, so we make that the
# intuitive escape hatch for cache poison: the first attempt
# uses the cache (fast); any re-run forces a cold compile.
# 2. `workflow_dispatch.clear_cache=true` — manual force-cold on
# 3. `workflow_dispatch.clear_cache=true` — manual force-cold on
# a fresh run (e.g. validating a CACHE_SALT bump before PRs
# start hitting it).
#
@@ -116,18 +120,18 @@
# every re-run cost ~2 min in PR #951 run 24937664669 — wasted
# budget that contributed to the 30-min cold-build cancellation.
#
# We wipe AFTER the restore step (rather than skipping the restore)
# so `steps.dd-cache.outputs.cache-primary-key` stays populated and
# the `Save DerivedData cache` step at the bottom can still
# repopulate the cache on a successful `main` run.
- name: Wipe restored DerivedData (re-run or workflow_dispatch clear_cache)
if: ${{ github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }}
# On main/manual runs we wipe AFTER the restore step (rather than
# skipping the restore) so `steps.dd-cache.outputs.cache-primary-key`
# stays populated and the `Save DerivedData cache` step at the bottom
# can still repopulate the cache on a successful `main` run.
- name: Wipe restored DerivedData (PR, re-run, or workflow_dispatch clear_cache)
if: ${{ github.event_name == 'pull_request' || github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }}
run: |
REASON="run_attempt=${{ github.run_attempt }}"
REASON="event=${{ github.event_name }}, run_attempt=${{ github.run_attempt }}"
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.clear_cache }}" = "true" ]; then
REASON="$REASON, workflow_dispatch clear_cache=true"
fi
echo "::notice title=Cold build forced::Wiping restored DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved). To re-run with the warm cache instead, push a new commit or trigger a fresh run."
echo "::notice title=Cold build forced::Wiping DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved)."
rm -rf "$HOME/Library/Developer/Xcode/DerivedData"

- name: Resolve dependencies
@@ -248,7 +252,7 @@ jobs:
echo
echo "**\`run_attempt > 1\` AND \`cache-hit: false\`?** That's the deliberate cold-rebuild path triggered by **Re-run failed jobs** — see the \`Wipe restored DerivedData\` step in this job. If the cold build is exhausting the 45-min budget on every re-run, the codebase has outgrown the budget; bump \`timeout-minutes\` and update its comment block, OR move warm-cache priming to a nightly \`main\` job so PRs always warm-start."
echo
echo "**Suspect cache poisoning on a fresh attempt?** Click **Re-run failed jobs** — re-runs automatically wipe DerivedData (the SPM cache is preserved because it's pinned by \`Package.resolved\` and can't be poisoned)."
echo "**Suspect cache poisoning on a fresh attempt?** Pull requests already cold-build DerivedData; main/manual re-runs wipe DerivedData automatically while preserving the pinned SPM source cache."
} >> "$GITHUB_STEP_SUMMARY"
else
# Mode B.
32 changes: 12 additions & 20 deletions Packages/OsaurusCore/Managers/TTSService.swift
@@ -157,25 +157,9 @@ public final class TTSService: ObservableObject {
let voice = TTSConfigurationStore.load().voice
initTask = Task { [weak self] in
do {
// Route through the downloader explicitly so we get progress callbacks.
// When models are already cached this returns nearly instantly.
_ = try await PocketTtsResourceDownloader.ensureModels(
directory: nil,
progressHandler: { progress in
Task { @MainActor in
guard let self else { return }
let fraction: Double?
switch progress.phase {
case .downloading:
fraction = progress.fractionCompleted
case .listing, .compiling:
fraction = nil
}
self.modelState = .downloading(fraction: fraction)
}
}
)

// Let FluidAudio pick its default language pack so this call
// stays compatible across the workspace-pinned and package-resolved
// PocketTTS APIs.
let mgr = PocketTtsManager(defaultVoice: voice)
try await mgr.initialize()
await MainActor.run {
@@ -216,9 +200,17 @@
.appendingPathComponent("fluidaudio", isDirectory: true)
.appendingPathComponent("Models", isDirectory: true)
.appendingPathComponent("pocket-tts", isDirectory: true)
let candidateDirs = [
repoDir,
repoDir
.appendingPathComponent("v2", isDirectory: true)
.appendingPathComponent("english", isDirectory: true)
]
let required = ModelNames.PocketTTS.requiredModels
let fm = FileManager.default
return required.allSatisfy { fm.fileExists(atPath: repoDir.appendingPathComponent($0).path) }
return candidateDirs.contains { directory in
required.allSatisfy { fm.fileExists(atPath: directory.appendingPathComponent($0).path) }
}
}

// MARK: - Playback
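The layout probe above distills to a small standalone check. A minimal sketch under assumed names (`modelsPresent` and `model.bin` are hypothetical; the real required list comes from `ModelNames.PocketTTS.requiredModels`):

```swift
import Foundation

// Illustrative distillation of the probe: true when ANY candidate layout
// contains EVERY required file. Function and file names are hypothetical.
func modelsPresent(under root: URL, required: [String]) -> Bool {
    let candidates = [
        root,  // legacy flat layout
        root.appendingPathComponent("v2", isDirectory: true)
            .appendingPathComponent("english", isDirectory: true),  // nested layout
    ]
    let fm = FileManager.default
    return candidates.contains { dir in
        required.allSatisfy { fm.fileExists(atPath: dir.appendingPathComponent($0).path) }
    }
}

// Throwaway-directory demo: only the nested layout holds the file.
let root = FileManager.default.temporaryDirectory
    .appendingPathComponent(UUID().uuidString, isDirectory: true)
let nested = root.appendingPathComponent("v2/english", isDirectory: true)
try? FileManager.default.createDirectory(at: nested, withIntermediateDirectories: true)
_ = FileManager.default.createFile(atPath: nested.appendingPathComponent("model.bin").path, contents: Data())
print(modelsPresent(under: root, required: ["model.bin"]))   // true
print(modelsPresent(under: root, required: ["missing.bin"])) // false
```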
39 changes: 39 additions & 0 deletions Packages/OsaurusCore/Tests/Tool/ToolRegistryTimeoutTests.swift
@@ -8,6 +8,7 @@
// hanging the agent loop indefinitely.
//

import Dispatch
import Foundation
import Testing

@@ -31,6 +32,25 @@
}
}

/// Tool body that ignores cooperative Swift cancellation without burning a
/// Swift concurrency executor thread. This mirrors process / blocking I/O
/// classes where returning a timeout envelope must not wait for the losing
/// branch to drain.
private struct BlockingSleepTool: OsaurusTool {
let name: String = "test_blocking_sleep"
let description: String = "Test fixture: completes later than the timeout."
let parameters: JSONValue? = .object(["type": .string("object")])

func execute(argumentsJSON: String) async throws -> String {
await withCheckedContinuation { continuation in
DispatchQueue.global().asyncAfter(deadline: .now() + 1.2) {
continuation.resume()
}
}
return ToolEnvelope.success(tool: name, text: "did not time out")
}
}

/// Tool body that completes well within the test timeout. Used as a
/// happy-path control to confirm the timeout race doesn't fire
/// spuriously on fast tools.
@@ -75,6 +95,25 @@
#expect(elapsed < 4.0, "took \(elapsed)s — expected <4s if timeout race fired")
}

@Test
func blockingToolReturnsTimeoutWithoutWaitingForBodyToDrain() async throws {
let tool = BlockingSleepTool()
let started = Date()
let result = try await ToolRegistry.runToolBody(
tool,
argumentsJSON: "{}",
timeoutSeconds: 0.1
)
let elapsed = Date().timeIntervalSince(started)

#expect(ToolEnvelope.isError(result))
let data = result.data(using: .utf8)!
let parsed = try JSONSerialization.jsonObject(with: data) as? [String: Any]
#expect(parsed?["kind"] as? String == "timeout")
#expect(parsed?["tool"] as? String == tool.name)
#expect(elapsed < 1.0, "took \(elapsed)s — timeout waited for the blocked body")
}

@Test
func fastToolReturnsItsOwnResultBeforeTimeoutFires() async throws {
let tool = FastEchoTool()
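Why the `BlockingSleepTool` fixture waits on a dispatch timer rather than `Task.sleep`: cooperative sleep observes cancellation immediately, while a continuation resumed from GCD never checks it. A standalone sketch of that contrast (illustrative only, written as a top-level-await script, not part of the test target):

```swift
import Dispatch
import Foundation

// Cooperative wait: Task.sleep throws CancellationError as soon as the
// task is cancelled, so this finishes almost immediately.
let cooperative = Task { () -> String in
    do {
        try await Task.sleep(nanoseconds: 1_200_000_000)
        return "finished"
    } catch {
        return "cancelled promptly"  // Task.sleep only throws on cancellation
    }
}
cooperative.cancel()
print(await cooperative.value)  // "cancelled promptly"

// Non-cooperative wait: the GCD timer never looks at Task.isCancelled,
// so cancellation is silently ignored and the full 1.2s elapses.
let stubborn = Task { () -> String in
    await withCheckedContinuation { (c: CheckedContinuation<Void, Never>) in
        DispatchQueue.global().asyncAfter(deadline: .now() + 1.2) { c.resume() }
    }
    return "finished anyway"
}
stubborn.cancel()
print(await stubborn.value)  // "finished anyway", 1.2s later
```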
115 changes: 73 additions & 42 deletions Packages/OsaurusCore/Tools/ToolRegistry.swift
@@ -8,6 +8,54 @@
import Foundation
import Combine

private final class ToolBodyTimeoutRaceState: @unchecked Sendable {
private let lock = NSLock()
private var continuation: CheckedContinuation<String, Never>?
private var bodyTask: Task<Void, Never>?
private var timeoutTask: Task<Void, Never>?
private var cancelBodyWhenSet = false
private var cancelTimeoutWhenSet = false

init(continuation: CheckedContinuation<String, Never>) {
self.continuation = continuation
}

func setTasks(body: Task<Void, Never>, timeout: Task<Void, Never>) {
lock.lock()
bodyTask = body
timeoutTask = timeout
let shouldCancelBody = cancelBodyWhenSet
let shouldCancelTimeout = cancelTimeoutWhenSet
lock.unlock()

if shouldCancelBody { body.cancel() }
if shouldCancelTimeout { timeout.cancel() }
}

func finish(with result: String, cancelBody: Bool, cancelTimeout: Bool) {
lock.lock()
guard let continuation else {
lock.unlock()
return
}
self.continuation = nil

let bodyToCancel = cancelBody ? bodyTask : nil
let timeoutToCancel = cancelTimeout ? timeoutTask : nil
if cancelBody, bodyTask == nil {
cancelBodyWhenSet = true
}
if cancelTimeout, timeoutTask == nil {
cancelTimeoutWhenSet = true
}
lock.unlock()

bodyToCancel?.cancel()
timeoutToCancel?.cancel()
continuation.resume(returning: result)
}
}

@MainActor
final class ToolRegistry: ObservableObject {
static let shared = ToolRegistry()
@@ -338,7 +386,7 @@ final class ToolRegistry: ObservableObject {
/// fall through unchanged: parsing is best-effort, and tool bodies
/// keep their richer `requireXxx` helpers as the second line of
/// defence.
private nonisolated static func preflight(
nonisolated private static func preflight(
argumentsJSON: String,
schema: JSONValue?,
toolName: String
@@ -392,14 +440,13 @@ final class ToolRegistry: ObservableObject {
/// tests can drive it with a small `timeoutSeconds` value without
/// waiting for the full 120s production budget.
///
/// Each branch of the race converts thrown errors (including
/// `CancellationError` from the loser when we `cancelAll`) into a
/// structured `ToolEnvelope` *inside* its child task. That keeps
/// `withTaskGroup` non-throwing and prevents the cancelled sibling's
/// post-return throw from reaching the caller as the function's
/// error — historically the slow-tool case rethrew CancellationError
/// and stalled while the group drained.
internal nonisolated static func runToolBody(
/// The body and timeout run as unstructured tasks rather than a task
/// group. That is intentional: task-group scope exit drains cancelled
/// children, so a non-cooperative tool body can still delay the timeout
/// response until it returns. The race state resumes the caller once and
/// cancels the loser without waiting for that loser to observe
/// cancellation.
nonisolated static func runToolBody(
_ tool: OsaurusTool,
argumentsJSON: String,
timeoutSeconds: TimeInterval
@@ -412,49 +459,33 @@
tool: toolName,
retryable: true
)
// Sentinel returned by the cancelled loser branch so the
// consumer loop knows to ignore it. Cannot collide with any
// legitimate envelope because real envelopes are JSON.
let cancelledSentinel = "__osaurus_runToolBody_cancelled__"

return await withTaskGroup(of: String.self) { group in
group.addTask {
return await withCheckedContinuation { continuation in
let race = ToolBodyTimeoutRaceState(continuation: continuation)
let bodyTask = Task {
do {
return try await tool.execute(argumentsJSON: argumentsJSON)
let result = try await tool.execute(argumentsJSON: argumentsJSON)
race.finish(with: result, cancelBody: false, cancelTimeout: true)
} catch is CancellationError {
return cancelledSentinel
// A cooperative loser should not overwrite the timeout
// envelope. If cancellation happened before the timeout
// fired, the timeout task remains responsible for the
// structured result.
return
} catch {
return ToolEnvelope.fromError(error, tool: toolName)
let result = ToolEnvelope.fromError(error, tool: toolName)
race.finish(with: result, cancelBody: false, cancelTimeout: true)
}
}
group.addTask {
let nanos = UInt64(timeoutSeconds * 1_000_000_000)
let timeoutTask = Task {
let nanos = UInt64(max(0, timeoutSeconds) * 1_000_000_000)
do {
try await Task.sleep(nanoseconds: nanos)
} catch {
// Cancelled because the body finished first — yield
// the sentinel so the caller's first non-sentinel
// result wins.
return cancelledSentinel
return
}
return timeoutEnvelope
}

// The first non-sentinel result is the winner; cancel the
// sibling and let `withTaskGroup` auto-drain on closure
// return. The drain is safe because every child branch
// converts its own errors into envelope strings — there
// are no uncaught throws to surface.
for await result in group {
if result == cancelledSentinel { continue }
group.cancelAll()
return result
race.finish(with: timeoutEnvelope, cancelBody: true, cancelTimeout: false)
}
return ToolEnvelope.failure(
kind: .executionError,
message: "Tool '\(toolName)' produced no result.",
tool: toolName
)
race.setTasks(body: bodyTask, timeout: timeoutTask)
}
}

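For reference, a minimal self-contained sketch of the race shape `runToolBody` now implements, under assumed names (`OneShot` and `raceWithTimeout` are illustrative, not the shipped API). It keeps the property the doc comment describes: the caller resumes as soon as either side finishes, and the losing body is cancelled but never awaited. One simplification versus the production code: when the body wins, the sleeping timeout task is left to expire into a no-op resume instead of being cancelled.

```swift
import Dispatch
import Foundation

// One-shot resume guard: whichever branch wins resumes the continuation;
// the loser's resume call becomes a no-op.
final class OneShot: @unchecked Sendable {
    private let lock = NSLock()
    private var continuation: CheckedContinuation<String, Never>?
    init(_ continuation: CheckedContinuation<String, Never>) {
        self.continuation = continuation
    }
    func resume(returning value: String) {
        lock.lock()
        let c = continuation
        continuation = nil
        lock.unlock()
        c?.resume(returning: value)
    }
}

// Races `body` against a deadline using unstructured tasks, so returning
// the timeout result never waits for a non-cooperative body to drain.
func raceWithTimeout(
    seconds: TimeInterval,
    body: @escaping @Sendable () async -> String
) async -> String {
    await withCheckedContinuation { continuation in
        let shot = OneShot(continuation)
        let bodyTask = Task { shot.resume(returning: await body()) }
        Task {
            try? await Task.sleep(nanoseconds: UInt64(max(0, seconds) * 1_000_000_000))
            shot.resume(returning: "timed out")  // no-op if the body already won
            bodyTask.cancel()                    // cancelled, but never awaited
        }
    }
}

// A body that ignores cancellation still lets the timeout win promptly.
let result = await raceWithTimeout(seconds: 0.1) {
    await withCheckedContinuation { (c: CheckedContinuation<Void, Never>) in
        DispatchQueue.global().asyncAfter(deadline: .now() + 1.0) { c.resume() }
    }
    return "body finished"
}
print(result)  // "timed out", after ~0.1s rather than the full 1s
```

A task group would express the same race, but group scope exit awaits cancelled children, so the non-cooperative body above would hold the result until its timer fired; resuming a guarded continuation from whichever side wins is what sidesteps that drain.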