Skip to content

Commit e183c94

Browse files
authored
Merge pull request docker#2121 from dgageot/board/our-management-of-thinking-is-a-mess-and-bc64245d
Rework thinking budget: opt-in by default, adaptive thinking, effort levels
2 parents 2aa5a46 + 0b7d92e commit e183c94

File tree

11 files changed

+688
-1351
lines changed

11 files changed

+688
-1351
lines changed

agent-schema.json

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@
540540
"description": "Whether to track usage"
541541
},
542542
"thinking_budget": {
543-
"description": "Controls reasoning effort/budget. Use 'none' or 0 to disable thinking. OpenAI: string levels ('minimal','low','medium','high'), default 'medium'. Anthropic: integer token budget (1024-32768), default 8192. Amazon Bedrock (Claude): same as Anthropic. Google Gemini 2.5: integer token budget (-1 for dynamic, 0 to disable, 24576 max), default -1. Google Gemini 3: string levels ('minimal' Flash only,'low','medium','high'), default 'high' for Pro, 'medium' for Flash.",
543+
"description": "Controls reasoning effort/budget. Use 'none' or 0 to disable thinking. OpenAI: string levels ('minimal','low','medium','high'). Anthropic: integer token budget (1024-32768), 'adaptive' (lets the model decide), or effort levels ('low','medium','high','max') which use adaptive thinking with the given effort. Amazon Bedrock (Claude): integer token budget or effort levels ('low','medium','high') mapped to token budgets. Google Gemini 2.5: integer token budget (-1 for dynamic, 0 to disable, 24576 max). Google Gemini 3: string levels ('minimal' Flash only,'low','medium','high'). Thinking is only enabled when explicitly configured.",
544544
"oneOf": [
545545
{
546546
"type": "string",
@@ -549,9 +549,11 @@
549549
"minimal",
550550
"low",
551551
"medium",
552-
"high"
552+
"high",
553+
"max",
554+
"adaptive"
553555
],
554-
"description": "Reasoning effort level (OpenAI, Gemini 3). Use 'none' to disable thinking."
556+
"description": "Reasoning effort level. 'adaptive'/'max' are Anthropic-specific. Use 'none' to disable thinking."
555557
},
556558
{
557559
"type": "integer",
@@ -567,6 +569,8 @@
567569
"low",
568570
"medium",
569571
"high",
572+
"max",
573+
"adaptive",
570574
-1,
571575
1024,
572576
8192,

examples/thinking_budget.yaml

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
agents:
77
root:
88
model: gpt-5-mini-min # <- try with gpt-5-mini-high
9-
# model: claude-4-5-sonnet-min # <- try with claude-4-5-sonnet-high
9+
# model: claude-4-5-sonnet-min # <- try with claude-4-5-sonnet-high or claude-opus-4-6-adaptive
1010
# model: gemini-2-5-flash-dynamic-thinking # <- try with -no-thinking, -low or -high variants
1111
description: a helpful assistant that thinks
1212
instruction: you are a helpful assistant who can also use tools, but only if you need to
@@ -29,15 +29,25 @@ models:
2929
claude-4-5-sonnet-min:
3030
provider: anthropic
3131
model: claude-sonnet-4-5-20250929
32-
thinking_budget: 1024 # <- tokens, 1024 is the minimum
32+
thinking_budget: 1024 # <- explicit token budget (1024-32768) for older models
3333

3434
claude-4-5-sonnet-high:
3535
provider: anthropic
3636
model: claude-sonnet-4-5-20250929
37-
thinking_budget: 32768 # <- tokens, 32768 is the Anthropic suggested maximum without batching
37+
thinking_budget: 32768 # <- explicit token budget (32768 is the Anthropic suggested maximum)
3838
provider_opts:
3939
interleaved_thinking: true # <- enables interleaved thinking, aka tool calling during model reasoning
4040

41+
claude-opus-4-6-adaptive:
42+
provider: anthropic
43+
model: claude-opus-4-6
44+
thinking_budget: adaptive # <- lets the model decide when and how much to think (recommended for 4.6)
45+
46+
claude-opus-4-6-low:
47+
provider: anthropic
48+
model: claude-opus-4-6
49+
thinking_budget: low # <- adaptive thinking with low effort; other accepted levels: "medium", "high", "max"
50+
4151
gemini-2-5-flash-dynamic-thinking:
4252
provider: google
4353
model: gemini-2.5-flash

pkg/config/latest/types.go

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -398,7 +398,10 @@ type ModelConfig struct {
398398
TrackUsage *bool `json:"track_usage,omitempty"`
399399
// ThinkingBudget controls reasoning effort/budget:
400400
// - For OpenAI: accepts string levels "minimal", "low", "medium", "high"
401-
// - For Anthropic: accepts integer token budget (1024-32000)
401+
// - For Anthropic: accepts integer token budget (1024-32768), "adaptive",
402+
// or string levels "low", "medium", "high", "max" (adaptive thinking with that effort; "minimal" is treated as "low")
403+
// - For Bedrock Claude: accepts integer token budget or string levels
404+
// "minimal", "low", "medium", "high" (mapped to token budgets via EffortTokens)
402405
// - For other providers: may be ignored
403406
ThinkingBudget *ThinkingBudget `json:"thinking_budget,omitempty"`
404407
// Routing defines rules for routing requests to different models.
@@ -671,6 +674,7 @@ func (d DeferConfig) MarshalYAML() (any, error) {
671674
// ThinkingBudget represents reasoning budget configuration.
672675
// It accepts either a string effort level or an integer token budget:
673676
// - String: "minimal", "low", "medium", "high" (for OpenAI)
677+
// - String: "adaptive" or "max" (Anthropic only; "max" uses adaptive thinking with maximum effort)
674678
// - Integer: token count (for Anthropic, range 1024-32768)
675679
type ThinkingBudget struct {
676680
// Effort stores string-based reasoning effort levels
@@ -718,14 +722,50 @@ func (t ThinkingBudget) MarshalYAML() (any, error) {
718722
// NOT disabled when:
719723
// - Tokens > 0 or Tokens == -1 (explicit token budget)
720724
// - Effort is a real level like "medium" or "high"
725+
// - Effort is "adaptive"
721726
func (t *ThinkingBudget) IsDisabled() bool {
722727
if t == nil {
723728
return false
724729
}
725730
if t.Tokens == 0 && t.Effort == "" {
726731
return true
727732
}
728-
return t.Effort == "none"
733+
return strings.EqualFold(t.Effort, "none")
734+
}
735+
736+
// IsAdaptive returns true if the thinking budget is set to adaptive mode.
737+
// Adaptive thinking lets the model decide how much thinking to do.
738+
func (t *ThinkingBudget) IsAdaptive() bool {
739+
if t == nil {
740+
return false
741+
}
742+
return strings.EqualFold(t.Effort, "adaptive")
743+
}
744+
745+
// EffortTokens maps a string effort level to a token budget for providers
746+
// that only support token-based thinking (e.g. Bedrock Claude).
747+
//
748+
// The Anthropic direct API uses adaptive thinking + output_config.effort
749+
// for string levels instead; see anthropicEffort in the anthropic package.
750+
//
751+
// Returns (tokens, true) when a mapping exists, or (0, false) when
752+
// the budget uses an explicit token count or an unrecognised effort string.
753+
func (t *ThinkingBudget) EffortTokens() (int, bool) {
754+
if t == nil || t.Effort == "" {
755+
return 0, false
756+
}
757+
switch strings.ToLower(strings.TrimSpace(t.Effort)) {
758+
case "minimal":
759+
return 1024, true
760+
case "low":
761+
return 2048, true
762+
case "medium":
763+
return 8192, true
764+
case "high":
765+
return 16384, true
766+
default:
767+
return 0, false
768+
}
729769
}
730770

731771
// MarshalJSON implements custom marshaling to output simple string or int format

pkg/config/latest/types_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,77 @@ func TestThinkingBudget_MarshalUnmarshal_Zero(t *testing.T) {
121121
require.Equal(t, "thinking_budget: 0\n", string(output))
122122
}
123123

124+
func TestThinkingBudget_IsDisabled(t *testing.T) {
125+
t.Parallel()
126+
127+
for _, tt := range []struct {
128+
name string
129+
b *ThinkingBudget
130+
want bool
131+
}{
132+
{"nil", nil, false},
133+
{"zero tokens", &ThinkingBudget{Tokens: 0}, true},
134+
{"none effort", &ThinkingBudget{Effort: "none"}, true},
135+
{"positive tokens", &ThinkingBudget{Tokens: 8192}, false},
136+
{"medium effort", &ThinkingBudget{Effort: "medium"}, false},
137+
{"adaptive effort", &ThinkingBudget{Effort: "adaptive"}, false},
138+
{"negative tokens (dynamic)", &ThinkingBudget{Tokens: -1}, false},
139+
} {
140+
t.Run(tt.name, func(t *testing.T) {
141+
t.Parallel()
142+
require.Equal(t, tt.want, tt.b.IsDisabled())
143+
})
144+
}
145+
}
146+
147+
func TestThinkingBudget_IsAdaptive(t *testing.T) {
148+
t.Parallel()
149+
150+
for _, tt := range []struct {
151+
name string
152+
b *ThinkingBudget
153+
want bool
154+
}{
155+
{"nil", nil, false},
156+
{"adaptive", &ThinkingBudget{Effort: "adaptive"}, true},
157+
{"medium", &ThinkingBudget{Effort: "medium"}, false},
158+
{"tokens", &ThinkingBudget{Tokens: 8192}, false},
159+
} {
160+
t.Run(tt.name, func(t *testing.T) {
161+
t.Parallel()
162+
require.Equal(t, tt.want, tt.b.IsAdaptive())
163+
})
164+
}
165+
}
166+
167+
func TestThinkingBudget_EffortTokens(t *testing.T) {
168+
t.Parallel()
169+
170+
for _, tt := range []struct {
171+
name string
172+
b *ThinkingBudget
173+
wantTokens int
174+
wantOK bool
175+
}{
176+
{"nil", nil, 0, false},
177+
{"minimal", &ThinkingBudget{Effort: "minimal"}, 1024, true},
178+
{"low", &ThinkingBudget{Effort: "low"}, 2048, true},
179+
{"medium", &ThinkingBudget{Effort: "medium"}, 8192, true},
180+
{"high", &ThinkingBudget{Effort: "high"}, 16384, true},
181+
{"adaptive", &ThinkingBudget{Effort: "adaptive"}, 0, false},
182+
{"none", &ThinkingBudget{Effort: "none"}, 0, false},
183+
{"explicit tokens", &ThinkingBudget{Tokens: 4096}, 0, false},
184+
{"empty effort", &ThinkingBudget{}, 0, false},
185+
} {
186+
t.Run(tt.name, func(t *testing.T) {
187+
t.Parallel()
188+
tokens, ok := tt.b.EffortTokens()
189+
require.Equal(t, tt.wantOK, ok)
190+
require.Equal(t, tt.wantTokens, tokens)
191+
})
192+
}
193+
}
194+
124195
func TestAgents_UnmarshalYAML_RejectsUnknownFields(t *testing.T) {
125196
t.Parallel()
126197

pkg/model/provider/anthropic/beta_client.go

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -95,20 +95,38 @@ func (c *Client) createBetaStream(
9595
// For interleaved thinking to make sense, we use a default of 16384 tokens for the thinking budget
9696
thinkingEnabled := c.ModelOptions.Thinking() == nil || *c.ModelOptions.Thinking()
9797
if thinkingEnabled {
98-
thinkingTokens := int64(16384)
99-
if c.ModelConfig.ThinkingBudget != nil {
100-
thinkingTokens = int64(c.ModelConfig.ThinkingBudget.Tokens)
98+
if c.ModelConfig.ThinkingBudget != nil && c.ModelConfig.ThinkingBudget.IsAdaptive() {
99+
// Adaptive thinking: let the model decide how much thinking to do
100+
adaptive := anthropic.NewBetaThinkingConfigAdaptiveParam()
101+
params.Thinking = anthropic.BetaThinkingConfigParamUnion{
102+
OfAdaptive: &adaptive,
103+
}
104+
slog.Debug("Anthropic Beta API using adaptive thinking")
105+
} else if effort, ok := anthropicEffort(c.ModelConfig.ThinkingBudget); ok {
106+
// Effort level: use adaptive thinking + output_config.effort
107+
adaptive := anthropic.NewBetaThinkingConfigAdaptiveParam()
108+
params.Thinking = anthropic.BetaThinkingConfigParamUnion{
109+
OfAdaptive: &adaptive,
110+
}
111+
params.OutputConfig.Effort = anthropic.BetaOutputConfigEffort(effort)
112+
slog.Debug("Anthropic Beta API using adaptive thinking with effort",
113+
"effort", effort)
101114
} else {
102-
slog.Info("Anthropic Beta API using default thinking_budget with interleaved thinking", "budget_tokens", thinkingTokens)
103-
}
104-
switch {
105-
case thinkingTokens >= 1024 && thinkingTokens < maxTokens:
106-
params.Thinking = anthropic.BetaThinkingConfigParamOfEnabled(thinkingTokens)
107-
slog.Debug("Anthropic Beta API using thinking_budget with interleaved thinking", "budget_tokens", thinkingTokens)
108-
case thinkingTokens >= maxTokens:
109-
slog.Warn("Anthropic Beta API thinking_budget must be less than max_tokens, ignoring", "tokens", thinkingTokens, "max_tokens", maxTokens)
110-
default:
111-
slog.Warn("Anthropic Beta API thinking_budget below minimum (1024), ignoring", "tokens", thinkingTokens)
115+
thinkingTokens := int64(16384)
116+
if c.ModelConfig.ThinkingBudget != nil {
117+
thinkingTokens = int64(c.ModelConfig.ThinkingBudget.Tokens)
118+
} else {
119+
slog.Info("Anthropic Beta API using default thinking_budget with interleaved thinking", "budget_tokens", thinkingTokens)
120+
}
121+
switch {
122+
case thinkingTokens >= 1024 && thinkingTokens < maxTokens:
123+
params.Thinking = anthropic.BetaThinkingConfigParamOfEnabled(thinkingTokens)
124+
slog.Debug("Anthropic Beta API using thinking_budget with interleaved thinking", "budget_tokens", thinkingTokens)
125+
case thinkingTokens >= maxTokens:
126+
slog.Warn("Anthropic Beta API thinking_budget must be less than max_tokens, ignoring", "tokens", thinkingTokens, "max_tokens", maxTokens)
127+
default:
128+
slog.Warn("Anthropic Beta API thinking_budget below minimum (1024), ignoring", "tokens", thinkingTokens)
129+
}
112130
}
113131
} else {
114132
slog.Debug("Anthropic Beta API: Thinking disabled via /think command")

pkg/model/provider/anthropic/client.go

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,23 @@ func (c *Client) getResponseTrailer() http.Header {
5050
// adjustMaxTokensForThinking checks if max_tokens needs adjustment for thinking_budget.
5151
// Anthropic's max_tokens represents the combined budget for thinking + output tokens.
5252
// Returns the adjusted maxTokens value and an error if user-set max_tokens is too low.
53+
//
54+
// This only applies to fixed token budgets. Adaptive thinking and effort-based
55+
// budgets don't need adjustment since the model manages its own thinking allocation.
5356
func (c *Client) adjustMaxTokensForThinking(maxTokens int64) (int64, error) {
54-
if c.ModelConfig.ThinkingBudget == nil || c.ModelConfig.ThinkingBudget.Tokens <= 0 {
57+
if c.ModelConfig.ThinkingBudget == nil || c.ModelConfig.ThinkingBudget.IsAdaptive() {
58+
return maxTokens, nil
59+
}
60+
// Effort-based budgets use adaptive thinking — no token adjustment needed.
61+
if _, ok := anthropicEffort(c.ModelConfig.ThinkingBudget); ok {
5562
return maxTokens, nil
5663
}
5764

5865
thinkingTokens := int64(c.ModelConfig.ThinkingBudget.Tokens)
66+
if thinkingTokens <= 0 {
67+
return maxTokens, nil
68+
}
69+
5970
minRequired := thinkingTokens + 1024 // configured thinking budget + minimum output buffer
6071

6172
if maxTokens <= thinkingTokens {
@@ -297,7 +308,25 @@ func (c *Client) CreateChatCompletionStream(
297308

298309
// Apply thinking budget first, as it affects whether we can set temperature
299310
thinkingEnabled := false
300-
if c.ModelConfig.ThinkingBudget != nil && c.ModelConfig.ThinkingBudget.Tokens > 0 {
311+
if c.ModelConfig.ThinkingBudget != nil && c.ModelConfig.ThinkingBudget.IsAdaptive() {
312+
// Adaptive thinking: let the model decide how much thinking to do
313+
adaptive := anthropic.NewThinkingConfigAdaptiveParam()
314+
params.Thinking = anthropic.ThinkingConfigParamUnion{
315+
OfAdaptive: &adaptive,
316+
}
317+
thinkingEnabled = true
318+
slog.Debug("Anthropic API using adaptive thinking (standard messages)")
319+
} else if effort, ok := anthropicEffort(c.ModelConfig.ThinkingBudget); ok {
320+
// Effort level: use adaptive thinking + output_config.effort
321+
adaptive := anthropic.NewThinkingConfigAdaptiveParam()
322+
params.Thinking = anthropic.ThinkingConfigParamUnion{
323+
OfAdaptive: &adaptive,
324+
}
325+
params.OutputConfig.Effort = anthropic.OutputConfigEffort(effort)
326+
thinkingEnabled = true
327+
slog.Debug("Anthropic API using adaptive thinking with effort",
328+
"effort", effort)
329+
} else if c.ModelConfig.ThinkingBudget != nil && c.ModelConfig.ThinkingBudget.Tokens > 0 {
301330
thinkingTokens := int64(c.ModelConfig.ThinkingBudget.Tokens)
302331
switch {
303332
case thinkingTokens >= 1024 && thinkingTokens < maxTokens:
@@ -895,6 +924,29 @@ func differenceIDs(a, b map[string]struct{}) []string {
895924
return missing
896925
}
897926

927+
// anthropicEffort maps a ThinkingBudget effort string to an Anthropic API
928+
// effort level ("low", "medium", "high", "max"). Returns ("", false) when
929+
// the budget is nil or uses token counts, adaptive mode, or an unrecognised string.
930+
func anthropicEffort(b *latest.ThinkingBudget) (string, bool) {
931+
if b == nil {
932+
return "", false
933+
}
934+
switch strings.ToLower(strings.TrimSpace(b.Effort)) {
935+
case "low":
936+
return "low", true
937+
case "minimal": // "minimal" is not in the Anthropic API; map to closest
938+
return "low", true
939+
case "medium":
940+
return "medium", true
941+
case "high":
942+
return "high", true
943+
case "max":
944+
return "max", true
945+
default:
946+
return "", false
947+
}
948+
}
949+
898950
// anthropicContextLimit returns a reasonable default context window for Anthropic models.
899951
// We default to 200k tokens, which is what 3.5-4.5 models support; adjust as needed over time.
900952
func anthropicContextLimit(model string) int64 {

pkg/model/provider/bedrock/client.go

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -275,16 +275,23 @@ func (c *Client) buildInferenceConfig() *types.InferenceConfiguration {
275275
return cfg
276276
}
277277

278+
// resolveThinkingTokens returns the effective token budget for thinking.
279+
// It handles both explicit token counts and effort-level strings.
280+
// Returns 0 if no valid thinking budget is configured.
281+
func (c *Client) resolveThinkingTokens() int {
282+
if c.ModelConfig.ThinkingBudget == nil {
283+
return 0
284+
}
285+
if tokens, ok := c.ModelConfig.ThinkingBudget.EffortTokens(); ok {
286+
return tokens
287+
}
288+
return c.ModelConfig.ThinkingBudget.Tokens
289+
}
290+
278291
// isThinkingEnabled mirrors the validation in buildAdditionalModelRequestFields
279292
// to determine if thinking params will affect inference config (temp/topP constraints).
280293
func (c *Client) isThinkingEnabled() bool {
281-
if c.ModelConfig.ThinkingBudget == nil || c.ModelConfig.ThinkingBudget.Tokens <= 0 {
282-
return false
283-
}
284-
285-
tokens := c.ModelConfig.ThinkingBudget.Tokens
286-
287-
// Check minimum (Claude requires at least 1024 tokens for thinking)
294+
tokens := c.resolveThinkingTokens()
288295
if tokens < 1024 {
289296
return false
290297
}
@@ -310,12 +317,11 @@ func (c *Client) promptCachingEnabled() bool {
310317

311318
// buildAdditionalModelRequestFields configures Claude's extended thinking (reasoning) mode.
312319
func (c *Client) buildAdditionalModelRequestFields() document.Interface {
313-
if c.ModelConfig.ThinkingBudget == nil || c.ModelConfig.ThinkingBudget.Tokens <= 0 {
320+
tokens := c.resolveThinkingTokens()
321+
if tokens <= 0 {
314322
return nil
315323
}
316324

317-
tokens := c.ModelConfig.ThinkingBudget.Tokens
318-
319325
// Validate minimum (Claude requires at least 1024 tokens for thinking)
320326
if tokens < 1024 {
321327
slog.Warn("Bedrock thinking_budget below minimum (1024), ignoring",

0 commit comments

Comments
 (0)