Skip to content

Commit ef43b76

Browse files
committed
fix(tools): handle cancellation and ETXTBSY retries
Ensure cancelled tool-call turns still return one synthesized result per announced call so strict providers can resume conversations cleanly. Also retry transient ETXTBSY failures when launching custom or agent scripts, which can happen when freshly written scripts are executed immediately while other processes are spawning.
1 parent a015bd0 commit ef43b76

7 files changed

Lines changed: 115 additions & 38 deletions

File tree

internal/llm/engine.go

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2582,8 +2582,11 @@ func buildAssistantMessageWithReasoningMetadata(text string, toolCalls []ToolCal
25822582
// may arrive in non-deterministic order. Consumers should use ToolCallID to correlate
25832583
// start/end events rather than relying on ordering.
25842584
func (e *Engine) executeToolCalls(ctx context.Context, calls []ToolCall, parallel bool, send eventSender, debug bool, debugRaw bool) ([]Message, error) {
2585+
// Cancellation must still yield a result message for every announced call:
2586+
// the caller persists the assistant message with its tool calls, and a turn
2587+
// with dangling tool calls breaks conversation resume on strict providers.
25852588
if err := ctx.Err(); err != nil {
2586-
return nil, err
2589+
return cancelledToolCallMessages(calls, err), nil
25872590
}
25882591

25892592
// Fast path: single call, no concurrency overhead
@@ -2593,9 +2596,9 @@ func (e *Engine) executeToolCalls(ctx context.Context, calls []ToolCall, paralle
25932596

25942597
if !parallel {
25952598
results := make([]Message, 0, len(calls))
2596-
for _, call := range calls {
2599+
for i, call := range calls {
25972600
if err := ctx.Err(); err != nil {
2598-
return nil, err
2601+
return append(results, cancelledToolCallMessages(calls[i:], err)...), nil
25992602
}
26002603
msgs, err := e.executeSingleToolCallSafe(ctx, call, send, debug, debugRaw)
26012604
if err != nil {
@@ -2659,13 +2662,43 @@ func (e *Engine) executeToolCalls(ctx context.Context, calls []ToolCall, paralle
26592662
case r := <-resultChan:
26602663
results[r.index] = r.message
26612664
case <-ctx.Done():
2662-
return nil, ctx.Err()
2665+
// Keep any results that finished before cancellation, then
2666+
// synthesize cancelled results for the rest so every announced
2667+
// call stays paired with a result in the persisted turn.
2668+
for drained := false; !drained; {
2669+
select {
2670+
case r := <-resultChan:
2671+
results[r.index] = r.message
2672+
default:
2673+
drained = true
2674+
}
2675+
}
2676+
for i := range results {
2677+
if results[i].Role == "" {
2678+
results[i] = cancelledToolCallMessage(calls[i], ctx.Err())
2679+
}
2680+
}
2681+
return results, nil
26632682
}
26642683
}
26652684

26662685
return results, nil
26672686
}
26682687

2688+
// cancelledToolCallMessage synthesizes the error result for a tool call that
2689+
// was skipped or abandoned because the context was cancelled.
2690+
func cancelledToolCallMessage(call ToolCall, err error) Message {
2691+
return ToolErrorMessage(call.ID, call.Name, fmt.Sprintf("Error: %v", err), call.ThoughtSig)
2692+
}
2693+
2694+
func cancelledToolCallMessages(calls []ToolCall, err error) []Message {
2695+
msgs := make([]Message, 0, len(calls))
2696+
for _, call := range calls {
2697+
msgs = append(msgs, cancelledToolCallMessage(call, err))
2698+
}
2699+
return msgs
2700+
}
2701+
26692702
// executeSingleToolCallSafe wraps executeSingleToolCall with panic recovery.
26702703
func (e *Engine) executeSingleToolCallSafe(ctx context.Context, call ToolCall, send eventSender, debug bool, debugRaw bool) (msgs []Message, err error) {
26712704
defer func() {

internal/llm/engine_test.go

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,16 +1270,31 @@ func TestExecuteToolCallsParallelReturnsOnContextCancel(t *testing.T) {
12701270
}
12711271

12721272
start := time.Now()
1273-
_, err := engine.executeToolCalls(ctx, calls, true, eventSender{}, false, false)
1273+
results, err := engine.executeToolCalls(ctx, calls, true, eventSender{}, false, false)
12741274
elapsed := time.Since(start)
12751275
t.Logf("parallel tool execution returned after cancellation in %v", elapsed)
12761276

1277-
if !errors.Is(err, context.Canceled) {
1278-
t.Fatalf("executeToolCalls error = %v, want context.Canceled", err)
1277+
if err != nil {
1278+
t.Fatalf("executeToolCalls error = %v, want synthesized results on cancellation", err)
12791279
}
12801280
if elapsed >= 200*time.Millisecond {
12811281
t.Fatalf("executeToolCalls returned after %v; want prompt return on cancellation", elapsed)
12821282
}
1283+
if len(results) != len(calls) {
1284+
t.Fatalf("results = %d, want one per announced call (%d)", len(results), len(calls))
1285+
}
1286+
for i, msg := range results {
1287+
if len(msg.Parts) == 0 || msg.Parts[0].ToolResult == nil {
1288+
t.Fatalf("result %d has no tool result part: %#v", i, msg)
1289+
}
1290+
tr := msg.Parts[0].ToolResult
1291+
if tr.ID != calls[i].ID {
1292+
t.Fatalf("result %d ID = %q, want %q", i, tr.ID, calls[i].ID)
1293+
}
1294+
if !tr.IsError || !strings.Contains(tr.Content, context.Canceled.Error()) {
1295+
t.Fatalf("result %d = %+v, want cancellation error result", i, tr)
1296+
}
1297+
}
12831298
}
12841299

12851300
// namedTool is a simple tool with a configurable name for testing

internal/tools/custom_tool.go

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -103,34 +103,40 @@ func (t *CustomScriptTool) Execute(ctx context.Context, args json.RawMessage) (l
103103
execCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
104104
defer cancel()
105105

106-
// Build the command according to the calling convention.
107-
cmd, err := t.buildCommand(execCtx, scriptPath, args)
108-
if err != nil {
109-
return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "build command: %v", err))), nil
110-
}
111-
cmd.Dir = workDir
112-
113106
// Build environment: inherit + agent-specific vars + per-tool env
114107
env := os.Environ()
115108
env = append(env, fmt.Sprintf("TERM_LLM_AGENT_DIR=%s", t.agentDir))
116109
env = append(env, fmt.Sprintf("TERM_LLM_TOOL_NAME=%s", t.def.Name))
117110
for k, v := range t.def.Env {
118111
env = append(env, fmt.Sprintf("%s=%s", k, v))
119112
}
120-
cmd.Env = env
121-
122-
cleanup, prepErr := prepareToolCommand(cmd)
123-
if prepErr != nil {
124-
return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
125-
}
126-
defer cleanup()
127113

128114
stdout := newLimitedBuffer(t.limits.MaxBytes)
129115
stderr := newLimitedBuffer(t.limits.MaxBytes)
130-
cmd.Stdout = stdout
131-
cmd.Stderr = stderr
132116

133-
execErr := cmd.Run()
117+
var execErr error
118+
for attempt := 0; ; attempt++ {
119+
// Build the command according to the calling convention.
120+
cmd, err := t.buildCommand(execCtx, scriptPath, args)
121+
if err != nil {
122+
return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "build command: %v", err))), nil
123+
}
124+
cmd.Dir = workDir
125+
cmd.Env = append([]string(nil), env...)
126+
cmd.Stdout = stdout
127+
cmd.Stderr = stderr
128+
129+
cleanup, prepErr := prepareToolCommand(cmd)
130+
if prepErr != nil {
131+
return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
132+
}
133+
134+
execErr = cmd.Run()
135+
cleanup()
136+
if !retryScriptBusy(execCtx, execErr, attempt) {
137+
break
138+
}
139+
}
134140

135141
result := ShellResult{
136142
Stdout: stdout.String(),

internal/tools/custom_tool_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ func TestCustomScriptTool_TimeoutKillsGrandchildren(t *testing.T) {
305305
t.Fatalf("unexpected error: %v", err)
306306
}
307307
if !out.TimedOut {
308-
t.Fatal("expected timeout for grandchild-holding script")
308+
t.Fatalf("expected timeout for grandchild-holding script, got: %s", out.Content)
309309
}
310310
}
311311

internal/tools/exec_helpers.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ package tools
22

33
import (
44
"bytes"
5+
"context"
56
"crypto/rand"
67
"encoding/hex"
8+
"errors"
79
"fmt"
810
"os"
911
"os/exec"
@@ -93,6 +95,21 @@ func prepareToolCommand(cmd *exec.Cmd) (func(), error) {
9395
return cleanup, nil
9496
}
9597

98+
const scriptBusyMaxRetries = 4
99+
100+
// retryScriptBusy reports whether a failed script exec should be retried,
101+
// sleeping briefly before returning true. ETXTBSY is transient: a fork from a
102+
// concurrent goroutine can inherit the just-written script's writable file
103+
// descriptor and hold it across our exec, e.g. when a script is written and
104+
// immediately executed while other commands are being spawned.
105+
func retryScriptBusy(ctx context.Context, err error, attempt int) bool {
106+
if attempt >= scriptBusyMaxRetries || ctx.Err() != nil || !errors.Is(err, syscall.ETXTBSY) {
107+
return false
108+
}
109+
time.Sleep(10 * time.Millisecond)
110+
return true
111+
}
112+
96113
// tagCommandWithNonce adds a unique TERMLLM_SHELL_NONCE=<hex> entry to cmd.Env
97114
// so descendants can be identified later by scanning /proc. Returns the nonce
98115
// value, or "" if it could not be generated (in which case tagging is silently

internal/tools/run_agent_script.go

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -166,21 +166,27 @@ func (t *RunAgentScriptTool) Execute(ctx context.Context, args json.RawMessage)
166166
execCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
167167
defer cancel()
168168

169-
cmd := exec.CommandContext(execCtx, realScript, argv...)
170-
cmd.Dir = workDir
171-
172-
cleanup, prepErr := prepareToolCommand(cmd)
173-
if prepErr != nil {
174-
return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
175-
}
176-
defer cleanup()
177-
178169
stdout := newLimitedBuffer(t.limits.MaxBytes)
179170
stderr := newLimitedBuffer(t.limits.MaxBytes)
180-
cmd.Stdout = stdout
181-
cmd.Stderr = stderr
182171

183-
execErr := cmd.Run()
172+
var execErr error
173+
for attempt := 0; ; attempt++ {
174+
cmd := exec.CommandContext(execCtx, realScript, argv...)
175+
cmd.Dir = workDir
176+
cmd.Stdout = stdout
177+
cmd.Stderr = stderr
178+
179+
cleanup, prepErr := prepareToolCommand(cmd)
180+
if prepErr != nil {
181+
return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
182+
}
183+
184+
execErr = cmd.Run()
185+
cleanup()
186+
if !retryScriptBusy(execCtx, execErr, attempt) {
187+
break
188+
}
189+
}
184190

185191
result := ShellResult{
186192
Stdout: stdout.String(),

internal/tools/run_agent_script_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ func TestRunAgentScriptTool_TimeoutKillsGrandchildren(t *testing.T) {
243243
t.Fatalf("Execute returned error: %v", err)
244244
}
245245
if !output.TimedOut {
246-
t.Fatal("expected timeout for grandchild-holding script")
246+
t.Fatalf("expected timeout for grandchild-holding script, got: %s", output.Content)
247247
}
248248
}
249249

0 commit comments

Comments
 (0)