From 14fd3bd22004d0b0efb73dddb62069531ae91d63 Mon Sep 17 00:00:00 2001 From: Daniel Cox Date: Tue, 3 Mar 2026 14:10:48 +1030 Subject: [PATCH 1/2] feat: make default max_tokens configurable via DEFAULT_MAX_TOKENS env var Add DEFAULT_MAX_TOKENS environment variable (default: 2048, preserving existing behaviour) so operators can tune the fallback max_tokens without code changes. Changes: - setting.py: Add DEFAULT_MAX_TOKENS env var - schema.py: Import DEFAULT_MAX_TOKENS; use it as ChatRequest.max_tokens default - bedrock.py: Compute effective_max_tokens preferring max_completion_tokens (OpenAI newer field) over max_tokens (legacy), and use it consistently in inference_config and reasoning budget_tokens calculation The effective_max_tokens logic ensures that clients sending max_completion_tokens (the newer OpenAI field) are handled correctly, while the env var gives operators control over the default when neither field is specified by the client. --- src/api/models/bedrock.py | 19 +++++++++++-------- src/api/schema.py | 4 ++-- src/api/setting.py | 1 + 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/api/models/bedrock.py b/src/api/models/bedrock.py index 0aef7488..b7ca3ae2 100644 --- a/src/api/models/bedrock.py +++ b/src/api/models/bedrock.py @@ -774,8 +774,15 @@ def _parse_request(self, chat_request: ChatRequest) -> dict: system_prompts = self._parse_system_prompts(chat_request) # Base inference parameters. + # Prefer max_completion_tokens (OpenAI newer field) over max_tokens (legacy). + # This ensures clients sending max_completion_tokens (e.g., RooCode) are respected. + effective_max_tokens = ( + chat_request.max_completion_tokens + if chat_request.max_completion_tokens is not None + else chat_request.max_tokens + ) inference_config = { - "maxTokens": chat_request.max_tokens, + "maxTokens": effective_max_tokens, } # Only include optional parameters when specified @@ -818,15 +825,11 @@ def _parse_request(self, chat_request: ChatRequest) -> dict: if "anthropic.claude" in model_lower: # Claude format: reasoning_config = object with budget_tokens - max_tokens = ( - chat_request.max_completion_tokens - if chat_request.max_completion_tokens - else chat_request.max_tokens - ) + # effective_max_tokens already prefers max_completion_tokens over max_tokens budget_tokens = self._calc_budget_tokens( - max_tokens, chat_request.reasoning_effort + effective_max_tokens, chat_request.reasoning_effort ) - inference_config["maxTokens"] = max_tokens + inference_config["maxTokens"] = effective_max_tokens # unset topP - Not supported inference_config.pop("topP", None) diff --git a/src/api/schema.py b/src/api/schema.py index d4e39db3..ce1879a3 100644 --- a/src/api/schema.py +++ b/src/api/schema.py @@ -3,7 +3,7 @@ from pydantic import BaseModel, Field -from api.setting import DEFAULT_MODEL +from api.setting import DEFAULT_MAX_TOKENS, DEFAULT_MODEL class Model(BaseModel): @@ -106,7 +106,7 @@ class ChatRequest(BaseModel): temperature: float | None = Field(default=None, le=2.0, ge=0.0) top_p: float | None = Field(default=None, le=1.0, ge=0.0) user: str | None = None # Not used - max_tokens: int | None = 2048 + max_tokens: int | None = DEFAULT_MAX_TOKENS max_completion_tokens: int | None = None reasoning_effort: Literal["low", "medium", "high"] | None = None n: int | None = 1 # Not used diff --git a/src/api/setting.py b/src/api/setting.py index c69780b4..dc2e77e2 100644 --- a/src/api/setting.py +++ b/src/api/setting.py @@ -11,6 +11,7 @@ DEBUG = os.environ.get("DEBUG", "false").lower() != "false" AWS_REGION = os.environ.get("AWS_REGION", "us-west-2") +DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", "2048")) DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "anthropic.claude-3-sonnet-20240229-v1:0") DEFAULT_EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL", "cohere.embed-multilingual-v3") ENABLE_CROSS_REGION_INFERENCE = os.environ.get("ENABLE_CROSS_REGION_INFERENCE", "true").lower() != "false" From 7730015eaa10bcc5ead96933cd92f60c5f9ef1b1 Mon Sep 17 00:00:00 2001 From: Daniel Cox Date: Tue, 3 Mar 2026 19:54:00 +1030 Subject: [PATCH 2/2] Remove RooCode reference from max_completion_tokens comment The comment already explains the intent (prefer max_completion_tokens over max_tokens). Naming a specific client adds no value and will become stale. --- src/api/models/bedrock.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/api/models/bedrock.py b/src/api/models/bedrock.py index b7ca3ae2..7eef15fd 100644 --- a/src/api/models/bedrock.py +++ b/src/api/models/bedrock.py @@ -775,7 +775,6 @@ def _parse_request(self, chat_request: ChatRequest) -> dict: # Base inference parameters. # Prefer max_completion_tokens (OpenAI newer field) over max_tokens (legacy). - # This ensures clients sending max_completion_tokens (e.g., RooCode) are respected. effective_max_tokens = ( chat_request.max_completion_tokens if chat_request.max_completion_tokens is not None