fix(tools): handle cancellation and ETXTBSY retries

SamSaffron · SamSaffron · commit ef43b76f8d25 · 2026-06-10T12:43:19.000+10:00
Ensure cancelled tool-call turns still return one synthesized result per
announced call so strict providers can resume conversations cleanly.
Also retry transient ETXTBSY failures when launching custom or agent
scripts, which can happen when freshly written scripts are executed
immediately while other processes are spawning.
diff --git a/internal/llm/engine.go b/internal/llm/engine.go
@@ -2582,8 +2582,11 @@ func buildAssistantMessageWithReasoningMetadata(text string, toolCalls []ToolCal
 // may arrive in non-deterministic order. Consumers should use ToolCallID to correlate
 // start/end events rather than relying on ordering.
 func (e *Engine) executeToolCalls(ctx context.Context, calls []ToolCall, parallel bool, send eventSender, debug bool, debugRaw bool) ([]Message, error) {
+	// Cancellation must still yield a result message for every announced call:
+	// the caller persists the assistant message with its tool calls, and a turn
+	// with dangling tool calls breaks conversation resume on strict providers.
 	if err := ctx.Err(); err != nil {
-		return nil, err
+		return cancelledToolCallMessages(calls, err), nil
 	}
 
 	// Fast path: single call, no concurrency overhead
@@ -2593,9 +2596,9 @@ func (e *Engine) executeToolCalls(ctx context.Context, calls []ToolCall, paralle
 
 	if !parallel {
 		results := make([]Message, 0, len(calls))
-		for _, call := range calls {
+		for i, call := range calls {
 			if err := ctx.Err(); err != nil {
-				return nil, err
+				return append(results, cancelledToolCallMessages(calls[i:], err)...), nil
 			}
 			msgs, err := e.executeSingleToolCallSafe(ctx, call, send, debug, debugRaw)
 			if err != nil {
@@ -2659,13 +2662,43 @@ func (e *Engine) executeToolCalls(ctx context.Context, calls []ToolCall, paralle
 		case r := <-resultChan:
 			results[r.index] = r.message
 		case <-ctx.Done():
-			return nil, ctx.Err()
+			// Keep any results that finished before cancellation, then
+			// synthesize cancelled results for the rest so every announced
+			// call stays paired with a result in the persisted turn.
+			for drained := false; !drained; {
+				select {
+				case r := <-resultChan:
+					results[r.index] = r.message
+				default:
+					drained = true
+				}
+			}
+			for i := range results {
+				if results[i].Role == "" {
+					results[i] = cancelledToolCallMessage(calls[i], ctx.Err())
+				}
+			}
+			return results, nil
 		}
 	}
 
 	return results, nil
 }
 
+// cancelledToolCallMessage synthesizes the error result for a tool call that
+// was skipped or abandoned because the context was cancelled.
+func cancelledToolCallMessage(call ToolCall, err error) Message {
+	return ToolErrorMessage(call.ID, call.Name, fmt.Sprintf("Error: %v", err), call.ThoughtSig)
+}
+
+func cancelledToolCallMessages(calls []ToolCall, err error) []Message {
+	msgs := make([]Message, 0, len(calls))
+	for _, call := range calls {
+		msgs = append(msgs, cancelledToolCallMessage(call, err))
+	}
+	return msgs
+}
+
 // executeSingleToolCallSafe wraps executeSingleToolCall with panic recovery.
 func (e *Engine) executeSingleToolCallSafe(ctx context.Context, call ToolCall, send eventSender, debug bool, debugRaw bool) (msgs []Message, err error) {
 	defer func() {
diff --git a/internal/llm/engine_test.go b/internal/llm/engine_test.go
@@ -1270,16 +1270,31 @@ func TestExecuteToolCallsParallelReturnsOnContextCancel(t *testing.T) {
 	}
 
 	start := time.Now()
-	_, err := engine.executeToolCalls(ctx, calls, true, eventSender{}, false, false)
+	results, err := engine.executeToolCalls(ctx, calls, true, eventSender{}, false, false)
 	elapsed := time.Since(start)
 	t.Logf("parallel tool execution returned after cancellation in %v", elapsed)
 
-	if !errors.Is(err, context.Canceled) {
-		t.Fatalf("executeToolCalls error = %v, want context.Canceled", err)
+	if err != nil {
+		t.Fatalf("executeToolCalls error = %v, want synthesized results on cancellation", err)
 	}
 	if elapsed >= 200*time.Millisecond {
 		t.Fatalf("executeToolCalls returned after %v; want prompt return on cancellation", elapsed)
 	}
+	if len(results) != len(calls) {
+		t.Fatalf("results = %d, want one per announced call (%d)", len(results), len(calls))
+	}
+	for i, msg := range results {
+		if len(msg.Parts) == 0 || msg.Parts[0].ToolResult == nil {
+			t.Fatalf("result %d has no tool result part: %#v", i, msg)
+		}
+		tr := msg.Parts[0].ToolResult
+		if tr.ID != calls[i].ID {
+			t.Fatalf("result %d ID = %q, want %q", i, tr.ID, calls[i].ID)
+		}
+		if !tr.IsError || !strings.Contains(tr.Content, context.Canceled.Error()) {
+			t.Fatalf("result %d = %+v, want cancellation error result", i, tr)
+		}
+	}
 }
 
 // namedTool is a simple tool with a configurable name for testing
diff --git a/internal/tools/custom_tool.go b/internal/tools/custom_tool.go
@@ -103,34 +103,40 @@ func (t *CustomScriptTool) Execute(ctx context.Context, args json.RawMessage) (l
 	execCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
 	defer cancel()
 
-	// Build the command according to the calling convention.
-	cmd, err := t.buildCommand(execCtx, scriptPath, args)
-	if err != nil {
-		return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "build command: %v", err))), nil
-	}
-	cmd.Dir = workDir
-
 	// Build environment: inherit + agent-specific vars + per-tool env
 	env := os.Environ()
 	env = append(env, fmt.Sprintf("TERM_LLM_AGENT_DIR=%s", t.agentDir))
 	env = append(env, fmt.Sprintf("TERM_LLM_TOOL_NAME=%s", t.def.Name))
 	for k, v := range t.def.Env {
 		env = append(env, fmt.Sprintf("%s=%s", k, v))
 	}
-	cmd.Env = env
-
-	cleanup, prepErr := prepareToolCommand(cmd)
-	if prepErr != nil {
-		return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
-	}
-	defer cleanup()
 
 	stdout := newLimitedBuffer(t.limits.MaxBytes)
 	stderr := newLimitedBuffer(t.limits.MaxBytes)
-	cmd.Stdout = stdout
-	cmd.Stderr = stderr
 
-	execErr := cmd.Run()
+	var execErr error
+	for attempt := 0; ; attempt++ {
+		// Build the command according to the calling convention.
+		cmd, err := t.buildCommand(execCtx, scriptPath, args)
+		if err != nil {
+			return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "build command: %v", err))), nil
+		}
+		cmd.Dir = workDir
+		cmd.Env = append([]string(nil), env...)
+		cmd.Stdout = stdout
+		cmd.Stderr = stderr
+
+		cleanup, prepErr := prepareToolCommand(cmd)
+		if prepErr != nil {
+			return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
+		}
+
+		execErr = cmd.Run()
+		cleanup()
+		if !retryScriptBusy(execCtx, execErr, attempt) {
+			break
+		}
+	}
 
 	result := ShellResult{
 		Stdout:          stdout.String(),
diff --git a/internal/tools/custom_tool_test.go b/internal/tools/custom_tool_test.go
@@ -305,7 +305,7 @@ func TestCustomScriptTool_TimeoutKillsGrandchildren(t *testing.T) {
 		t.Fatalf("unexpected error: %v", err)
 	}
 	if !out.TimedOut {
-		t.Fatal("expected timeout for grandchild-holding script")
+		t.Fatalf("expected timeout for grandchild-holding script, got: %s", out.Content)
 	}
 }
 
diff --git a/internal/tools/exec_helpers.go b/internal/tools/exec_helpers.go
@@ -2,8 +2,10 @@ package tools
 
 import (
 	"bytes"
+	"context"
 	"crypto/rand"
 	"encoding/hex"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -93,6 +95,21 @@ func prepareToolCommand(cmd *exec.Cmd) (func(), error) {
 	return cleanup, nil
 }
 
+const scriptBusyMaxRetries = 4
+
+// retryScriptBusy reports whether a failed script exec should be retried,
+// sleeping briefly before returning true. ETXTBSY is transient: a fork from a
+// concurrent goroutine can inherit the just-written script's writable file
+// descriptor and hold it across our exec, e.g. when a script is written and
+// immediately executed while other commands are being spawned.
+func retryScriptBusy(ctx context.Context, err error, attempt int) bool {
+	if attempt >= scriptBusyMaxRetries || ctx.Err() != nil || !errors.Is(err, syscall.ETXTBSY) {
+		return false
+	}
+	time.Sleep(10 * time.Millisecond)
+	return true
+}
+
 // tagCommandWithNonce adds a unique TERMLLM_SHELL_NONCE=<hex> entry to cmd.Env
 // so descendants can be identified later by scanning /proc. Returns the nonce
 // value, or "" if it could not be generated (in which case tagging is silently
diff --git a/internal/tools/run_agent_script.go b/internal/tools/run_agent_script.go
@@ -166,21 +166,27 @@ func (t *RunAgentScriptTool) Execute(ctx context.Context, args json.RawMessage)
 	execCtx, cancel := context.WithTimeout(ctx, time.Duration(timeout)*time.Second)
 	defer cancel()
 
-	cmd := exec.CommandContext(execCtx, realScript, argv...)
-	cmd.Dir = workDir
-
-	cleanup, prepErr := prepareToolCommand(cmd)
-	if prepErr != nil {
-		return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
-	}
-	defer cleanup()
-
 	stdout := newLimitedBuffer(t.limits.MaxBytes)
 	stderr := newLimitedBuffer(t.limits.MaxBytes)
-	cmd.Stdout = stdout
-	cmd.Stderr = stderr
 
-	execErr := cmd.Run()
+	var execErr error
+	for attempt := 0; ; attempt++ {
+		cmd := exec.CommandContext(execCtx, realScript, argv...)
+		cmd.Dir = workDir
+		cmd.Stdout = stdout
+		cmd.Stderr = stderr
+
+		cleanup, prepErr := prepareToolCommand(cmd)
+		if prepErr != nil {
+			return llm.TextOutput(formatToolError(NewToolErrorf(ErrExecutionFailed, "script setup error: %v", prepErr))), nil
+		}
+
+		execErr = cmd.Run()
+		cleanup()
+		if !retryScriptBusy(execCtx, execErr, attempt) {
+			break
+		}
+	}
 
 	result := ShellResult{
 		Stdout:          stdout.String(),
diff --git a/internal/tools/run_agent_script_test.go b/internal/tools/run_agent_script_test.go
@@ -243,7 +243,7 @@ func TestRunAgentScriptTool_TimeoutKillsGrandchildren(t *testing.T) {
 		t.Fatalf("Execute returned error: %v", err)
 	}
 	if !output.TimedOut {
-		t.Fatal("expected timeout for grandchild-holding script")
+		t.Fatalf("expected timeout for grandchild-holding script, got: %s", output.Content)
 	}
 }
 

Original file line number	Diff line number	Diff line change
`@@ -305,7 +305,7 @@ func TestCustomScriptTool_TimeoutKillsGrandchildren(t *testing.T) {`
`305`	`305`	`t.Fatalf("unexpected error: %v", err)`
`306`	`306`	`}`
`307`	`307`	`if !out.TimedOut {`
`308`		`- t.Fatal("expected timeout for grandchild-holding script")`
	`308`	`+ t.Fatalf("expected timeout for grandchild-holding script, got: %s", out.Content)`
`309`	`309`	`}`
`310`	`310`	`}`
`311`	`311`
Original file line number	Diff line number	Diff line change
`@@ -243,7 +243,7 @@ func TestRunAgentScriptTool_TimeoutKillsGrandchildren(t *testing.T) {`
`243`	`243`	`t.Fatalf("Execute returned error: %v", err)`
`244`	`244`	`}`
`245`	`245`	`if !output.TimedOut {`
`246`		`- t.Fatal("expected timeout for grandchild-holding script")`
	`246`	`+ t.Fatalf("expected timeout for grandchild-holding script, got: %s", output.Content)`
`247`	`247`	`}`
`248`	`248`	`}`
`249`	`249`