36 changes: 20 additions & 16 deletions .github/workflows/ci.yml
@@ -26,7 +26,7 @@ permissions:
env:
# Bump to invalidate every cache entry without source surgery (e.g., after a
# known-bad cache or an Xcode toolchain upgrade we want to flush manually).
CACHE_SALT: v2-vmlx-5b84387
CACHE_SALT: v3-pr-cold-deriveddata
# Pin Xcode so cache keys are stable across runner image bumps. When you
# need to upgrade, change here AND in setup-xcode below.
XCODE_VERSION: "26.4.1"
@@ -83,9 +83,11 @@ jobs:

- name: Restore DerivedData cache
id: dd-cache
# Always restore so `cache-primary-key` is populated for the save
# step at the bottom (the wipe step below handles forced cold
# builds without preventing main from repopulating the cache).
# Restore only on main pushes / manual maintainer runs. Pull requests
# intentionally cold-build DerivedData: restore-key hits have produced
# stale Swift modules whose C-module dependencies are missing when
# Xcode later compiles EventSource.
if: ${{ github.event_name != 'pull_request' }}
uses: actions/cache/restore@v5
with:
path: ~/Library/Developer/Xcode/DerivedData
@@ -97,13 +99,15 @@
restore-keys: |
dd-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}-

# Make "clear the build cache" a one-click operation. Two triggers:
# 1. `github.run_attempt != '1'` — i.e. a re-run. The default
# Make "clear the build cache" a one-click operation. Three triggers:
# 1. Pull requests — always cold-build DerivedData so PRs never trust
# a cached Xcode build product from another ref.
# 2. `github.run_attempt != '1'` — i.e. a re-run. The default
# "Re-run failed jobs" button is the natural place for someone
# who just saw a build failure to land, so we make that the
# intuitive escape hatch for cache poison: the first attempt
# uses the cache (fast); any re-run forces a cold compile.
# 2. `workflow_dispatch.clear_cache=true` — manual force-cold on
# 3. `workflow_dispatch.clear_cache=true` — manual force-cold on
# a fresh run (e.g. validating a CACHE_SALT bump before PRs
# start hitting it).
#
@@ -116,18 +120,18 @@
# every re-run cost ~2 min in PR #951 run 24937664669 — wasted
# budget that contributed to the 30-min cold-build cancellation.
#
# We wipe AFTER the restore step (rather than skipping the restore)
# so `steps.dd-cache.outputs.cache-primary-key` stays populated and
# the `Save DerivedData cache` step at the bottom can still
# repopulate the cache on a successful `main` run.
- name: Wipe restored DerivedData (re-run or workflow_dispatch clear_cache)
if: ${{ github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }}
# On main/manual runs we wipe AFTER the restore step (rather than
# skipping the restore) so `steps.dd-cache.outputs.cache-primary-key`
# stays populated and the `Save DerivedData cache` step at the bottom
# can still repopulate the cache on a successful `main` run.
- name: Wipe restored DerivedData (PR, re-run, or workflow_dispatch clear_cache)
if: ${{ github.event_name == 'pull_request' || github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }}
run: |
REASON="run_attempt=${{ github.run_attempt }}"
REASON="event=${{ github.event_name }}, run_attempt=${{ github.run_attempt }}"
if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.clear_cache }}" = "true" ]; then
REASON="$REASON, workflow_dispatch clear_cache=true"
fi
echo "::notice title=Cold build forced::Wiping restored DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved). To re-run with the warm cache instead, push a new commit or trigger a fresh run."
echo "::notice title=Cold build forced::Wiping DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved)."
rm -rf "$HOME/Library/Developer/Xcode/DerivedData"

- name: Resolve dependencies
@@ -248,7 +252,7 @@ jobs:
echo
echo "**\`run_attempt > 1\` AND \`cache-hit: false\`?** That's the deliberate cold-rebuild path triggered by **Re-run failed jobs** — see the \`Wipe restored DerivedData\` step in this job. If the cold build is exhausting the 45-min budget on every re-run, the codebase has outgrown the budget; bump \`timeout-minutes\` and update its comment block, OR move warm-cache priming to a nightly \`main\` job so PRs always warm-start."
echo
echo "**Suspect cache poisoning on a fresh attempt?** Click **Re-run failed jobs** — re-runs automatically wipe DerivedData (the SPM cache is preserved because it's pinned by \`Package.resolved\` and can't be poisoned)."
echo "**Suspect cache poisoning on a fresh attempt?** Pull requests already cold-build DerivedData; main/manual re-runs wipe DerivedData automatically while preserving the pinned SPM source cache."
} >> "$GITHUB_STEP_SUMMARY"
else
# Mode B.
32 changes: 12 additions & 20 deletions Packages/OsaurusCore/Managers/TTSService.swift
@@ -157,25 +157,9 @@ public final class TTSService: ObservableObject {
let voice = TTSConfigurationStore.load().voice
initTask = Task { [weak self] in
do {
// Route through the downloader explicitly so we get progress callbacks.
// When models are already cached this returns nearly instantly.
_ = try await PocketTtsResourceDownloader.ensureModels(
directory: nil,
progressHandler: { progress in
Task { @MainActor in
guard let self else { return }
let fraction: Double?
switch progress.phase {
case .downloading:
fraction = progress.fractionCompleted
case .listing, .compiling:
fraction = nil
}
self.modelState = .downloading(fraction: fraction)
}
}
)

// Let FluidAudio pick its default language pack so this call
// stays compatible across the workspace-pinned and package-resolved
// PocketTTS APIs.
let mgr = PocketTtsManager(defaultVoice: voice)
try await mgr.initialize()
await MainActor.run {
@@ -216,9 +200,17 @@
.appendingPathComponent("fluidaudio", isDirectory: true)
.appendingPathComponent("Models", isDirectory: true)
.appendingPathComponent("pocket-tts", isDirectory: true)
let candidateDirs = [
repoDir,
repoDir
.appendingPathComponent("v2", isDirectory: true)
.appendingPathComponent("english", isDirectory: true)
]
let required = ModelNames.PocketTTS.requiredModels
let fm = FileManager.default
return required.allSatisfy { fm.fileExists(atPath: repoDir.appendingPathComponent($0).path) }
return candidateDirs.contains { directory in
required.allSatisfy { fm.fileExists(atPath: directory.appendingPathComponent($0).path) }
}
}

// MARK: - Playback
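The layout probe above distills to a small standalone check. A minimal sketch under assumed names (`modelsPresent` and `model.bin` are hypothetical; the real required list comes from `ModelNames.PocketTTS.requiredModels`):

```swift
import Foundation

// Illustrative distillation of the probe: true when ANY candidate layout
// contains EVERY required file. Function and file names are hypothetical.
func modelsPresent(under root: URL, required: [String]) -> Bool {
    let candidates = [
        root,  // legacy flat layout
        root.appendingPathComponent("v2", isDirectory: true)
            .appendingPathComponent("english", isDirectory: true),  // nested layout
    ]
    let fm = FileManager.default
    return candidates.contains { dir in
        required.allSatisfy { fm.fileExists(atPath: dir.appendingPathComponent($0).path) }
    }
}

// Throwaway-directory demo: only the nested layout holds the file.
let root = FileManager.default.temporaryDirectory
    .appendingPathComponent(UUID().uuidString, isDirectory: true)
let nested = root.appendingPathComponent("v2/english", isDirectory: true)
try? FileManager.default.createDirectory(at: nested, withIntermediateDirectories: true)
_ = FileManager.default.createFile(atPath: nested.appendingPathComponent("model.bin").path, contents: Data())
print(modelsPresent(under: root, required: ["model.bin"]))   // true
print(modelsPresent(under: root, required: ["missing.bin"])) // false
```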
39 changes: 39 additions & 0 deletions Packages/OsaurusCore/Tests/Tool/ToolRegistryTimeoutTests.swift
@@ -8,6 +8,7 @@
// hanging the agent loop indefinitely.
//

import Dispatch
import Foundation
import Testing

@@ -31,6 +32,25 @@
}
}

/// Tool body that ignores cooperative Swift cancellation without burning a
/// Swift concurrency executor thread. This mirrors process / blocking I/O
/// classes where returning a timeout envelope must not wait for the losing
/// branch to drain.
private struct BlockingSleepTool: OsaurusTool {
let name: String = "test_blocking_sleep"
let description: String = "Test fixture: completes later than the timeout."
let parameters: JSONValue? = .object(["type": .string("object")])

func execute(argumentsJSON: String) async throws -> String {
await withCheckedContinuation { continuation in
DispatchQueue.global().asyncAfter(deadline: .now() + 1.2) {
continuation.resume()
}
}
return ToolEnvelope.success(tool: name, text: "did not time out")
}
}

/// Tool body that completes well within the test timeout. Used as a
/// happy-path control to confirm the timeout race doesn't fire
/// spuriously on fast tools.
@@ -75,6 +95,25 @@
#expect(elapsed < 4.0, "took \(elapsed)s — expected <4s if timeout race fired")
}

@Test
func blockingToolReturnsTimeoutWithoutWaitingForBodyToDrain() async throws {
let tool = BlockingSleepTool()
let started = Date()
let result = try await ToolRegistry.runToolBody(
tool,
argumentsJSON: "{}",
timeoutSeconds: 0.1
)
let elapsed = Date().timeIntervalSince(started)

#expect(ToolEnvelope.isError(result))
let data = result.data(using: .utf8)!
let parsed = try JSONSerialization.jsonObject(with: data) as? [String: Any]
#expect(parsed?["kind"] as? String == "timeout")
#expect(parsed?["tool"] as? String == tool.name)
#expect(elapsed < 1.0, "took \(elapsed)s — timeout waited for the blocked body")
}

@Test
func fastToolReturnsItsOwnResultBeforeTimeoutFires() async throws {
let tool = FastEchoTool()
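Why the `BlockingSleepTool` fixture waits on a dispatch timer rather than `Task.sleep`: cooperative sleep observes cancellation immediately, while a continuation resumed from GCD never checks it. A standalone sketch of that contrast (illustrative only, written as a top-level-await script, not part of the test target):

```swift
import Dispatch
import Foundation

// Cooperative wait: Task.sleep throws CancellationError as soon as the
// task is cancelled, so this finishes almost immediately.
let cooperative = Task { () -> String in
    do {
        try await Task.sleep(nanoseconds: 1_200_000_000)
        return "finished"
    } catch {
        return "cancelled promptly"  // Task.sleep only throws on cancellation
    }
}
cooperative.cancel()
print(await cooperative.value)  // "cancelled promptly"

// Non-cooperative wait: the GCD timer never looks at Task.isCancelled,
// so cancellation is silently ignored and the full 1.2s elapses.
let stubborn = Task { () -> String in
    await withCheckedContinuation { (c: CheckedContinuation<Void, Never>) in
        DispatchQueue.global().asyncAfter(deadline: .now() + 1.2) { c.resume() }
    }
    return "finished anyway"
}
stubborn.cancel()
print(await stubborn.value)  // "finished anyway", 1.2s later
```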
115 changes: 73 additions & 42 deletions Packages/OsaurusCore/Tools/ToolRegistry.swift
@@ -8,6 +8,54 @@
import Foundation
import Combine

private final class ToolBodyTimeoutRaceState: @unchecked Sendable {
private let lock = NSLock()
private var continuation: CheckedContinuation<String, Never>?
private var bodyTask: Task<Void, Never>?
private var timeoutTask: Task<Void, Never>?
private var cancelBodyWhenSet = false
private var cancelTimeoutWhenSet = false

init(continuation: CheckedContinuation<String, Never>) {
self.continuation = continuation
}

func setTasks(body: Task<Void, Never>, timeout: Task<Void, Never>) {
lock.lock()
bodyTask = body
timeoutTask = timeout
let shouldCancelBody = cancelBodyWhenSet
let shouldCancelTimeout = cancelTimeoutWhenSet
lock.unlock()

if shouldCancelBody { body.cancel() }
if shouldCancelTimeout { timeout.cancel() }
}

func finish(with result: String, cancelBody: Bool, cancelTimeout: Bool) {
lock.lock()
guard let continuation else {
lock.unlock()
return
}
self.continuation = nil

let bodyToCancel = cancelBody ? bodyTask : nil
let timeoutToCancel = cancelTimeout ? timeoutTask : nil
if cancelBody, bodyTask == nil {
cancelBodyWhenSet = true
}
if cancelTimeout, timeoutTask == nil {
cancelTimeoutWhenSet = true
}
lock.unlock()

bodyToCancel?.cancel()
timeoutToCancel?.cancel()
continuation.resume(returning: result)
}
}

@MainActor
final class ToolRegistry: ObservableObject {
static let shared = ToolRegistry()
@@ -338,7 +386,7 @@ final class ToolRegistry: ObservableObject {
/// fall through unchanged: parsing is best-effort, and tool bodies
/// keep their richer `requireXxx` helpers as the second line of
/// defence.
private nonisolated static func preflight(
nonisolated private static func preflight(
argumentsJSON: String,
schema: JSONValue?,
toolName: String
@@ -392,14 +440,13 @@ final class ToolRegistry: ObservableObject {
/// tests can drive it with a small `timeoutSeconds` value without
/// waiting for the full 120s production budget.
///
/// Each branch of the race converts thrown errors (including
/// `CancellationError` from the loser when we `cancelAll`) into a
/// structured `ToolEnvelope` *inside* its child task. That keeps
/// `withTaskGroup` non-throwing and prevents the cancelled sibling's
/// post-return throw from reaching the caller as the function's
/// error — historically the slow-tool case rethrew CancellationError
/// and stalled while the group drained.
internal nonisolated static func runToolBody(
/// The body and timeout run as unstructured tasks rather than a task
/// group. That is intentional: task-group scope exit drains cancelled
/// children, so a non-cooperative tool body can still delay the timeout
/// response until it returns. The race state resumes the caller once and
/// cancels the loser without waiting for that loser to observe
/// cancellation.
nonisolated static func runToolBody(
_ tool: OsaurusTool,
argumentsJSON: String,
timeoutSeconds: TimeInterval
@@ -412,49 +459,33 @@
tool: toolName,
retryable: true
)
// Sentinel returned by the cancelled loser branch so the
// consumer loop knows to ignore it. Cannot collide with any
// legitimate envelope because real envelopes are JSON.
let cancelledSentinel = "__osaurus_runToolBody_cancelled__"

return await withTaskGroup(of: String.self) { group in
group.addTask {
return await withCheckedContinuation { continuation in
let race = ToolBodyTimeoutRaceState(continuation: continuation)
let bodyTask = Task {
do {
return try await tool.execute(argumentsJSON: argumentsJSON)
let result = try await tool.execute(argumentsJSON: argumentsJSON)
race.finish(with: result, cancelBody: false, cancelTimeout: true)
} catch is CancellationError {
return cancelledSentinel
// A cooperative loser should not overwrite the timeout
// envelope. If cancellation happened before the timeout
// fired, the timeout task remains responsible for the
// structured result.
return
} catch {
return ToolEnvelope.fromError(error, tool: toolName)
let result = ToolEnvelope.fromError(error, tool: toolName)
race.finish(with: result, cancelBody: false, cancelTimeout: true)
}
}
group.addTask {
let nanos = UInt64(timeoutSeconds * 1_000_000_000)
let timeoutTask = Task {
let nanos = UInt64(max(0, timeoutSeconds) * 1_000_000_000)
do {
try await Task.sleep(nanoseconds: nanos)
} catch {
// Cancelled because the body finished first — yield
// the sentinel so the caller's first non-sentinel
// result wins.
return cancelledSentinel
return
}
return timeoutEnvelope
}

// The first non-sentinel result is the winner; cancel the
// sibling and let `withTaskGroup` auto-drain on closure
// return. The drain is safe because every child branch
// converts its own errors into envelope strings — there
// are no uncaught throws to surface.
for await result in group {
if result == cancelledSentinel { continue }
group.cancelAll()
return result
race.finish(with: timeoutEnvelope, cancelBody: true, cancelTimeout: false)
}
return ToolEnvelope.failure(
kind: .executionError,
message: "Tool '\(toolName)' produced no result.",
tool: toolName
)
race.setTasks(body: bodyTask, timeout: timeoutTask)
}
}

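For reference, a minimal self-contained sketch of the race shape `runToolBody` now implements, under assumed names (`OneShot` and `raceWithTimeout` are illustrative, not the shipped API). It keeps the property the doc comment describes: the caller resumes as soon as either side finishes, and the losing body is cancelled but never awaited. One simplification versus the production code: when the body wins, the sleeping timeout task is left to expire into a no-op resume instead of being cancelled.

```swift
import Dispatch
import Foundation

// One-shot resume guard: whichever branch wins resumes the continuation;
// the loser's resume call becomes a no-op.
final class OneShot: @unchecked Sendable {
    private let lock = NSLock()
    private var continuation: CheckedContinuation<String, Never>?
    init(_ continuation: CheckedContinuation<String, Never>) {
        self.continuation = continuation
    }
    func resume(returning value: String) {
        lock.lock()
        let c = continuation
        continuation = nil
        lock.unlock()
        c?.resume(returning: value)
    }
}

// Races `body` against a deadline using unstructured tasks, so returning
// the timeout result never waits for a non-cooperative body to drain.
func raceWithTimeout(
    seconds: TimeInterval,
    body: @escaping @Sendable () async -> String
) async -> String {
    await withCheckedContinuation { continuation in
        let shot = OneShot(continuation)
        let bodyTask = Task { shot.resume(returning: await body()) }
        Task {
            try? await Task.sleep(nanoseconds: UInt64(max(0, seconds) * 1_000_000_000))
            shot.resume(returning: "timed out")  // no-op if the body already won
            bodyTask.cancel()                    // cancelled, but never awaited
        }
    }
}

// A body that ignores cancellation still lets the timeout win promptly.
let result = await raceWithTimeout(seconds: 0.1) {
    await withCheckedContinuation { (c: CheckedContinuation<Void, Never>) in
        DispatchQueue.global().asyncAfter(deadline: .now() + 1.0) { c.resume() }
    }
    return "body finished"
}
print(result)  // "timed out", after ~0.1s rather than the full 1s
```

A task group would express the same race, but group scope exit awaits cancelled children, so the non-cooperative body above would hold the result until its timer fired; resuming a guarded continuation from whichever side wins is what sidesteps that drain.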