diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b4b0b98d..cf3425e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ permissions: env: # Bump to invalidate every cache entry without source surgery (e.g., after a # known-bad cache or an Xcode toolchain upgrade we want to flush manually). - CACHE_SALT: v2-vmlx-5b84387 + CACHE_SALT: v3-pr-cold-deriveddata # Pin Xcode so cache keys are stable across runner image bumps. When you # need to upgrade, change here AND in setup-xcode below. XCODE_VERSION: "26.4.1" @@ -83,9 +83,11 @@ jobs: - name: Restore DerivedData cache id: dd-cache - # Always restore so `cache-primary-key` is populated for the save - # step at the bottom (the wipe step below handles forced cold - # builds without preventing main from repopulating the cache). + # Restore only on main pushes / manual maintainer runs. Pull requests + # intentionally cold-build DerivedData: restore-key hits have produced + # stale Swift modules whose C-module dependencies are missing when + # Xcode later compiles EventSource. + if: ${{ github.event_name != 'pull_request' }} uses: actions/cache/restore@v5 with: path: ~/Library/Developer/Xcode/DerivedData @@ -97,13 +99,15 @@ jobs: restore-keys: | dd-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}- - # Make "clear the build cache" a one-click operation. Two triggers: - # 1. `github.run_attempt != '1'` — i.e. a re-run. The default + # Make "clear the build cache" a one-click operation. Three triggers: + # 1. Pull requests — always cold-build DerivedData so PRs never trust + # a cached Xcode build product from another ref. + # 2. `github.run_attempt != '1'` — i.e. a re-run. The default # "Re-run failed jobs" button is the natural place for someone # who just saw a build failure to land, so we make that the # intuitive escape hatch for cache poison: the first attempt # uses the cache (fast); any re-run forces a cold compile. - # 2. 
`workflow_dispatch.clear_cache=true` — manual force-cold on + # 3. `workflow_dispatch.clear_cache=true` — manual force-cold on # a fresh run (e.g. validating a CACHE_SALT bump before PRs # start hitting it). # @@ -116,18 +120,18 @@ jobs: # every re-run cost ~2 min in PR #951 run 24937664669 — wasted # budget that contributed to the 30-min cold-build cancellation. # - # We wipe AFTER the restore step (rather than skipping the restore) - # so `steps.dd-cache.outputs.cache-primary-key` stays populated and - # the `Save DerivedData cache` step at the bottom can still - # repopulate the cache on a successful `main` run. - - name: Wipe restored DerivedData (re-run or workflow_dispatch clear_cache) - if: ${{ github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }} + # On main/manual runs we wipe AFTER the restore step (rather than + # skipping the restore) so `steps.dd-cache.outputs.cache-primary-key` + # stays populated and the `Save DerivedData cache` step at the bottom + # can still repopulate the cache on a successful `main` run. + - name: Wipe restored DerivedData (PR, re-run, or workflow_dispatch clear_cache) + if: ${{ github.event_name == 'pull_request' || github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }} run: | - REASON="run_attempt=${{ github.run_attempt }}" + REASON="event=${{ github.event_name }}, run_attempt=${{ github.run_attempt }}" if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.clear_cache }}" = "true" ]; then REASON="$REASON, workflow_dispatch clear_cache=true" fi - echo "::notice title=Cold build forced::Wiping restored DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved). To re-run with the warm cache instead, push a new commit or trigger a fresh run." + echo "::notice title=Cold build forced::Wiping DerivedData before build ($REASON). 
SPM cache preserved (it's source-only and pinned by Package.resolved)." rm -rf "$HOME/Library/Developer/Xcode/DerivedData" - name: Resolve dependencies @@ -248,7 +252,7 @@ jobs: echo echo "**\`run_attempt > 1\` AND \`cache-hit: false\`?** That's the deliberate cold-rebuild path triggered by **Re-run failed jobs** — see the \`Wipe restored DerivedData\` step in this job. If the cold build is exhausting the 45-min budget on every re-run, the codebase has outgrown the budget; bump \`timeout-minutes\` and update its comment block, OR move warm-cache priming to a nightly \`main\` job so PRs always warm-start." echo - echo "**Suspect cache poisoning on a fresh attempt?** Click **Re-run failed jobs** — re-runs automatically wipe DerivedData (the SPM cache is preserved because it's pinned by \`Package.resolved\` and can't be poisoned)." + echo "**Suspect cache poisoning on a fresh attempt?** Pull requests already cold-build DerivedData; main/manual re-runs wipe DerivedData automatically while preserving the pinned SPM source cache." } >> "$GITHUB_STEP_SUMMARY" else # Mode B. diff --git a/Packages/OsaurusCore/Managers/TTSService.swift b/Packages/OsaurusCore/Managers/TTSService.swift index 7c42b161..cd461115 100644 --- a/Packages/OsaurusCore/Managers/TTSService.swift +++ b/Packages/OsaurusCore/Managers/TTSService.swift @@ -157,25 +157,9 @@ public final class TTSService: ObservableObject { let voice = TTSConfigurationStore.load().voice initTask = Task { [weak self] in do { - // Route through the downloader explicitly so we get progress callbacks. - // When models are already cached this returns nearly instantly. - _ = try await PocketTtsResourceDownloader.ensureModels( - directory: nil, - progressHandler: { progress in - Task { @MainActor in - guard let self else { return } - let fraction: Double? 
- switch progress.phase { - case .downloading: - fraction = progress.fractionCompleted - case .listing, .compiling: - fraction = nil - } - self.modelState = .downloading(fraction: fraction) - } - } - ) - + // Let FluidAudio pick its default language pack so this call + // stays compatible across the workspace-pinned and package + // resolved PocketTTS APIs. let mgr = PocketTtsManager(defaultVoice: voice) try await mgr.initialize() await MainActor.run { @@ -216,9 +200,17 @@ public final class TTSService: ObservableObject { .appendingPathComponent("fluidaudio", isDirectory: true) .appendingPathComponent("Models", isDirectory: true) .appendingPathComponent("pocket-tts", isDirectory: true) + let candidateDirs = [ + repoDir, + repoDir + .appendingPathComponent("v2", isDirectory: true) + .appendingPathComponent("english", isDirectory: true) + ] let required = ModelNames.PocketTTS.requiredModels let fm = FileManager.default - return required.allSatisfy { fm.fileExists(atPath: repoDir.appendingPathComponent($0).path) } + return candidateDirs.contains { directory in + required.allSatisfy { fm.fileExists(atPath: directory.appendingPathComponent($0).path) } + } } // MARK: - Playback diff --git a/Packages/OsaurusCore/Tests/Tool/ToolRegistryTimeoutTests.swift b/Packages/OsaurusCore/Tests/Tool/ToolRegistryTimeoutTests.swift index f5c1795f..4ec5902c 100644 --- a/Packages/OsaurusCore/Tests/Tool/ToolRegistryTimeoutTests.swift +++ b/Packages/OsaurusCore/Tests/Tool/ToolRegistryTimeoutTests.swift @@ -8,6 +8,7 @@ // hanging the agent loop indefinitely. // +import Dispatch import Foundation import Testing @@ -31,6 +32,25 @@ struct ToolRegistryTimeoutTests { } } + /// Tool body that ignores cooperative Swift cancellation without burning a + /// Swift concurrency executor thread. This mirrors process / blocking I/O + /// classes where returning a timeout envelope must not wait for the losing + /// branch to drain. 
+ private struct BlockingSleepTool: OsaurusTool { + let name: String = "test_blocking_sleep" + let description: String = "Test fixture: completes later than the timeout." + let parameters: JSONValue? = .object(["type": .string("object")]) + + func execute(argumentsJSON: String) async throws -> String { + await withCheckedContinuation { continuation in + DispatchQueue.global().asyncAfter(deadline: .now() + 5.0) { + continuation.resume() + } + } + return ToolEnvelope.success(tool: name, text: "did not time out") + } + } + /// Tool body that completes well within the test timeout. Used as a /// happy-path control to confirm the timeout race doesn't fire /// spuriously on fast tools. @@ -75,6 +95,28 @@ struct ToolRegistryTimeoutTests { #expect(elapsed < 4.0, "took \(elapsed)s — expected <4s if timeout race fired") } + @Test + func blockingToolReturnsTimeoutWithoutWaitingForBodyToDrain() async throws { + let tool = BlockingSleepTool() + let started = Date() + let result = try await ToolRegistry.runToolBody( + tool, + argumentsJSON: "{}", + timeoutSeconds: 0.1 + ) + let elapsed = Date().timeIntervalSince(started) + + #expect(ToolEnvelope.isError(result)) + let data = result.data(using: .utf8)! + let parsed = try JSONSerialization.jsonObject(with: data) as? [String: Any] + #expect(parsed?["kind"] as? String == "timeout") + #expect(parsed?["tool"] as? String == tool.name) + // The body ignores cancellation and finishes after 5s. Keeping + // this below 4s proves the timeout path returned without draining + // the blocked body while leaving room for busy CI runners. 
#expect(elapsed < 4.0, "took \(elapsed)s — timeout waited for the blocked body") } + @Test func fastToolReturnsItsOwnResultBeforeTimeoutFires() async throws { let tool = FastEchoTool() diff --git a/Packages/OsaurusCore/Tools/ToolRegistry.swift b/Packages/OsaurusCore/Tools/ToolRegistry.swift index 88f1ff1e..ecfa1960 100644 --- a/Packages/OsaurusCore/Tools/ToolRegistry.swift +++ b/Packages/OsaurusCore/Tools/ToolRegistry.swift @@ -5,8 +5,69 @@ // Central registry for chat tools. Provides OpenAI tool specs and execution by name. // -import Foundation import Combine +import Dispatch +import Foundation + +private final class ToolBodyTimeoutRaceState: @unchecked Sendable { + private let lock = NSLock() + private var continuation: CheckedContinuation<String, Never>? + private var bodyTask: Task<Void, Never>? + private var timeoutTimer: DispatchSourceTimer? + private var cancelBodyWhenSet = false + private var cancelTimeoutWhenSet = false + + init(continuation: CheckedContinuation<String, Never>) { + self.continuation = continuation + } + + func setBodyTask(_ body: Task<Void, Never>) { + lock.lock() + bodyTask = body + let shouldCancelBody = cancelBodyWhenSet + lock.unlock() + + if shouldCancelBody { body.cancel() } + } + + func setTimeoutTimer(_ timeout: DispatchSourceTimer) { + lock.lock() + timeoutTimer = timeout + let shouldCancelTimeout = cancelTimeoutWhenSet + lock.unlock() + + if shouldCancelTimeout { + timeout.setEventHandler {} + timeout.cancel() + } + } + + func finish(with result: String, cancelBody: Bool, cancelTimeout: Bool) { + lock.lock() + guard let continuation else { + lock.unlock() + return + } + self.continuation = nil + + let bodyToCancel = cancelBody ?
bodyTask : nil + let timeoutToCancel = timeoutTimer + if cancelBody, bodyTask == nil { + cancelBodyWhenSet = true + } + if cancelTimeout, timeoutTimer == nil { + cancelTimeoutWhenSet = true + } + bodyTask = nil + timeoutTimer = nil + lock.unlock() + + bodyToCancel?.cancel() + timeoutToCancel?.setEventHandler {} + timeoutToCancel?.cancel() + continuation.resume(returning: result) + } +} @MainActor final class ToolRegistry: ObservableObject { @@ -338,7 +399,7 @@ final class ToolRegistry: ObservableObject { /// fall through unchanged: parsing is best-effort, and tool bodies /// keep their richer `requireXxx` helpers as the second line of /// defence. - private nonisolated static func preflight( + nonisolated private static func preflight( argumentsJSON: String, schema: JSONValue?, toolName: String @@ -392,14 +453,13 @@ final class ToolRegistry: ObservableObject { /// tests can drive it with a small `timeoutSeconds` value without /// waiting for the full 120s production budget. /// - /// Each branch of the race converts thrown errors (including - /// `CancellationError` from the loser when we `cancelAll`) into a - /// structured `ToolEnvelope` *inside* its child task. That keeps - /// `withTaskGroup` non-throwing and prevents the cancelled sibling's - /// post-return throw from reaching the caller as the function's - /// error — historically the slow-tool case rethrew CancellationError - /// and stalled while the group drained. - internal nonisolated static func runToolBody( + /// The body and timeout run as unstructured tasks rather than a task + /// group. That is intentional: task-group scope exit drains cancelled + /// children, so a non-cooperative tool body can still delay the timeout + /// response until it returns. The race state resumes the caller once and + /// cancels the loser without waiting for that loser to observe + /// cancellation. 
+ nonisolated static func runToolBody( _ tool: OsaurusTool, argumentsJSON: String, timeoutSeconds: TimeInterval @@ -412,49 +472,33 @@ final class ToolRegistry: ObservableObject { tool: toolName, retryable: true ) - // Sentinel returned by the cancelled loser branch so the - // consumer loop knows to ignore it. Cannot collide with any - // legitimate envelope because real envelopes are JSON. - let cancelledSentinel = "__osaurus_runToolBody_cancelled__" + return await withCheckedContinuation { continuation in + let race = ToolBodyTimeoutRaceState(continuation: continuation) + let timeoutTimer = DispatchSource.makeTimerSource(queue: .global(qos: .utility)) + let nanos = Int((max(0, timeoutSeconds) * 1_000_000_000).rounded(.up)) + timeoutTimer.schedule(deadline: .now() + .nanoseconds(nanos)) + timeoutTimer.setEventHandler { + race.finish(with: timeoutEnvelope, cancelBody: true, cancelTimeout: false) + } + race.setTimeoutTimer(timeoutTimer) + timeoutTimer.resume() - return await withTaskGroup(of: String.self) { group in - group.addTask { + let bodyTask = Task { do { - return try await tool.execute(argumentsJSON: argumentsJSON) + let result = try await tool.execute(argumentsJSON: argumentsJSON) + race.finish(with: result, cancelBody: false, cancelTimeout: true) } catch is CancellationError { - return cancelledSentinel + // A cooperative loser should not overwrite the timeout + // envelope. If cancellation happened before the timeout + // fired, the timeout timer remains responsible for the + // structured result. + return } catch { - return ToolEnvelope.fromError(error, tool: toolName) + let result = ToolEnvelope.fromError(error, tool: toolName) + race.finish(with: result, cancelBody: false, cancelTimeout: true) } } - group.addTask { - let nanos = UInt64(timeoutSeconds * 1_000_000_000) - do { - try await Task.sleep(nanoseconds: nanos) - } catch { - // Cancelled because the body finished first — yield - // the sentinel so the caller's first non-sentinel - // result wins. 
- return cancelledSentinel - } - return timeoutEnvelope - } - - // The first non-sentinel result is the winner; cancel the - // sibling and let `withTaskGroup` auto-drain on closure - // return. The drain is safe because every child branch - // converts its own errors into envelope strings — there - // are no uncaught throws to surface. - for await result in group { - if result == cancelledSentinel { continue } - group.cancelAll() - return result - } - return ToolEnvelope.failure( - kind: .executionError, - message: "Tool '\(toolName)' produced no result.", - tool: toolName - ) + race.setBodyTask(bodyTask) } }