Skip to content

Commit 06138fb

Browse files
Michael MedingMichael Meding
authored andcommitted
Return tool timeouts without draining blocked bodies
1 parent bf2ff85 commit 06138fb

2 files changed

Lines changed: 111 additions & 42 deletions

File tree

Packages/OsaurusCore/Tests/Tool/ToolRegistryTimeoutTests.swift

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,25 @@ struct ToolRegistryTimeoutTests {
3131
}
3232
}
3333

34+
/// Tool body that ignores cooperative Swift cancellation. This mirrors
35+
/// process / blocking I/O classes where returning a timeout envelope must
36+
/// not wait for the losing branch to drain.
37+
private struct BlockingSleepTool: OsaurusTool {
38+
let name: String = "test_blocking_sleep"
39+
let description: String = "Test fixture: blocks the thread longer than the timeout."
40+
let parameters: JSONValue? = .object(["type": .string("object")])
41+
42+
func execute(argumentsJSON: String) async throws -> String {
43+
let deadline = Date().timeIntervalSinceReferenceDate + 1.2
44+
var spin = 0
45+
while Date().timeIntervalSinceReferenceDate < deadline {
46+
spin &+= 1
47+
}
48+
_ = spin
49+
return ToolEnvelope.success(tool: name, text: "did not time out")
50+
}
51+
}
52+
3453
/// Tool body that completes well within the test timeout. Used as a
3554
/// happy-path control to confirm the timeout race doesn't fire
3655
/// spuriously on fast tools.
@@ -75,6 +94,25 @@ struct ToolRegistryTimeoutTests {
7594
#expect(elapsed < 4.0, "took \(elapsed)s — expected <4s if timeout race fired")
7695
}
7796

97+
@Test
98+
func blockingToolReturnsTimeoutWithoutWaitingForBodyToDrain() async throws {
99+
let tool = BlockingSleepTool()
100+
let started = Date()
101+
let result = try await ToolRegistry.runToolBody(
102+
tool,
103+
argumentsJSON: "{}",
104+
timeoutSeconds: 0.1
105+
)
106+
let elapsed = Date().timeIntervalSince(started)
107+
108+
#expect(ToolEnvelope.isError(result))
109+
let data = result.data(using: .utf8)!
110+
let parsed = try JSONSerialization.jsonObject(with: data) as? [String: Any]
111+
#expect(parsed?["kind"] as? String == "timeout")
112+
#expect(parsed?["tool"] as? String == tool.name)
113+
#expect(elapsed < 1.0, "took \(elapsed)s — timeout waited for the blocked body")
114+
}
115+
78116
@Test
79117
func fastToolReturnsItsOwnResultBeforeTimeoutFires() async throws {
80118
let tool = FastEchoTool()

Packages/OsaurusCore/Tools/ToolRegistry.swift

Lines changed: 73 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,54 @@
88
import Foundation
99
import Combine
1010

11+
private final class ToolBodyTimeoutRaceState: @unchecked Sendable {
12+
private let lock = NSLock()
13+
private var continuation: CheckedContinuation<String, Never>?
14+
private var bodyTask: Task<Void, Never>?
15+
private var timeoutTask: Task<Void, Never>?
16+
private var cancelBodyWhenSet = false
17+
private var cancelTimeoutWhenSet = false
18+
19+
init(continuation: CheckedContinuation<String, Never>) {
20+
self.continuation = continuation
21+
}
22+
23+
func setTasks(body: Task<Void, Never>, timeout: Task<Void, Never>) {
24+
lock.lock()
25+
bodyTask = body
26+
timeoutTask = timeout
27+
let shouldCancelBody = cancelBodyWhenSet
28+
let shouldCancelTimeout = cancelTimeoutWhenSet
29+
lock.unlock()
30+
31+
if shouldCancelBody { body.cancel() }
32+
if shouldCancelTimeout { timeout.cancel() }
33+
}
34+
35+
func finish(with result: String, cancelBody: Bool, cancelTimeout: Bool) {
36+
lock.lock()
37+
guard let continuation else {
38+
lock.unlock()
39+
return
40+
}
41+
self.continuation = nil
42+
43+
let bodyToCancel = cancelBody ? bodyTask : nil
44+
let timeoutToCancel = cancelTimeout ? timeoutTask : nil
45+
if cancelBody, bodyTask == nil {
46+
cancelBodyWhenSet = true
47+
}
48+
if cancelTimeout, timeoutTask == nil {
49+
cancelTimeoutWhenSet = true
50+
}
51+
lock.unlock()
52+
53+
bodyToCancel?.cancel()
54+
timeoutToCancel?.cancel()
55+
continuation.resume(returning: result)
56+
}
57+
}
58+
1159
@MainActor
1260
final class ToolRegistry: ObservableObject {
1361
static let shared = ToolRegistry()
@@ -334,7 +382,7 @@ final class ToolRegistry: ObservableObject {
334382
/// fall through unchanged: parsing is best-effort, and tool bodies
335383
/// keep their richer `requireXxx` helpers as the second line of
336384
/// defence.
337-
private nonisolated static func preflight(
385+
nonisolated private static func preflight(
338386
argumentsJSON: String,
339387
schema: JSONValue?,
340388
toolName: String
@@ -388,14 +436,13 @@ final class ToolRegistry: ObservableObject {
388436
/// tests can drive it with a small `timeoutSeconds` value without
389437
/// waiting for the full 120s production budget.
390438
///
391-
/// Each branch of the race converts thrown errors (including
392-
/// `CancellationError` from the loser when we `cancelAll`) into a
393-
/// structured `ToolEnvelope` *inside* its child task. That keeps
394-
/// `withTaskGroup` non-throwing and prevents the cancelled sibling's
395-
/// post-return throw from reaching the caller as the function's
396-
/// error — historically the slow-tool case rethrew CancellationError
397-
/// and stalled while the group drained.
398-
internal nonisolated static func runToolBody(
439+
/// The body and timeout run as unstructured tasks rather than a task
440+
/// group. That is intentional: task-group scope exit drains cancelled
441+
/// children, so a non-cooperative tool body can still delay the timeout
442+
/// response until it returns. The race state resumes the caller once and
443+
/// cancels the loser without waiting for that loser to observe
444+
/// cancellation.
445+
nonisolated static func runToolBody(
399446
_ tool: OsaurusTool,
400447
argumentsJSON: String,
401448
timeoutSeconds: TimeInterval
@@ -408,49 +455,33 @@ final class ToolRegistry: ObservableObject {
408455
tool: toolName,
409456
retryable: true
410457
)
411-
// Sentinel returned by the cancelled loser branch so the
412-
// consumer loop knows to ignore it. Cannot collide with any
413-
// legitimate envelope because real envelopes are JSON.
414-
let cancelledSentinel = "__osaurus_runToolBody_cancelled__"
415-
416-
return await withTaskGroup(of: String.self) { group in
417-
group.addTask {
458+
return await withCheckedContinuation { continuation in
459+
let race = ToolBodyTimeoutRaceState(continuation: continuation)
460+
let bodyTask = Task {
418461
do {
419-
return try await tool.execute(argumentsJSON: argumentsJSON)
462+
let result = try await tool.execute(argumentsJSON: argumentsJSON)
463+
race.finish(with: result, cancelBody: false, cancelTimeout: true)
420464
} catch is CancellationError {
421-
return cancelledSentinel
465+
// A cooperative loser should not overwrite the timeout
466+
// envelope. If cancellation happened before the timeout
467+
// fired, the timeout task remains responsible for the
468+
// structured result.
469+
return
422470
} catch {
423-
return ToolEnvelope.fromError(error, tool: toolName)
471+
let result = ToolEnvelope.fromError(error, tool: toolName)
472+
race.finish(with: result, cancelBody: false, cancelTimeout: true)
424473
}
425474
}
426-
group.addTask {
427-
let nanos = UInt64(timeoutSeconds * 1_000_000_000)
475+
let timeoutTask = Task {
476+
let nanos = UInt64(max(0, timeoutSeconds) * 1_000_000_000)
428477
do {
429478
try await Task.sleep(nanoseconds: nanos)
430479
} catch {
431-
// Cancelled because the body finished first — yield
432-
// the sentinel so the caller's first non-sentinel
433-
// result wins.
434-
return cancelledSentinel
480+
return
435481
}
436-
return timeoutEnvelope
437-
}
438-
439-
// The first non-sentinel result is the winner; cancel the
440-
// sibling and let `withTaskGroup` auto-drain on closure
441-
// return. The drain is safe because every child branch
442-
// converts its own errors into envelope strings — there
443-
// are no uncaught throws to surface.
444-
for await result in group {
445-
if result == cancelledSentinel { continue }
446-
group.cancelAll()
447-
return result
482+
race.finish(with: timeoutEnvelope, cancelBody: true, cancelTimeout: false)
448483
}
449-
return ToolEnvelope.failure(
450-
kind: .executionError,
451-
message: "Tool '\(toolName)' produced no result.",
452-
tool: toolName
453-
)
484+
race.setTasks(body: bodyTask, timeout: timeoutTask)
454485
}
455486
}
456487

0 commit comments

Comments
 (0)