diff --git a/src/api/models/bedrock.py b/src/api/models/bedrock.py index 0aef7488..7eef15fd 100644 --- a/src/api/models/bedrock.py +++ b/src/api/models/bedrock.py @@ -774,8 +774,14 @@ def _parse_request(self, chat_request: ChatRequest) -> dict: system_prompts = self._parse_system_prompts(chat_request) # Base inference parameters. + # Prefer max_completion_tokens (OpenAI newer field) over max_tokens (legacy). + effective_max_tokens = ( + chat_request.max_completion_tokens + if chat_request.max_completion_tokens is not None + else chat_request.max_tokens + ) inference_config = { - "maxTokens": chat_request.max_tokens, + "maxTokens": effective_max_tokens, } # Only include optional parameters when specified @@ -818,15 +824,11 @@ def _parse_request(self, chat_request: ChatRequest) -> dict: if "anthropic.claude" in model_lower: # Claude format: reasoning_config = object with budget_tokens - max_tokens = ( - chat_request.max_completion_tokens - if chat_request.max_completion_tokens - else chat_request.max_tokens - ) + # effective_max_tokens already prefers max_completion_tokens over max_tokens budget_tokens = self._calc_budget_tokens( - max_tokens, chat_request.reasoning_effort + effective_max_tokens, chat_request.reasoning_effort ) - inference_config["maxTokens"] = max_tokens + inference_config["maxTokens"] = effective_max_tokens # unset topP - Not supported inference_config.pop("topP", None) diff --git a/src/api/schema.py b/src/api/schema.py index d4e39db3..ce1879a3 100644 --- a/src/api/schema.py +++ b/src/api/schema.py @@ -3,7 +3,7 @@ from pydantic import BaseModel, Field -from api.setting import DEFAULT_MODEL +from api.setting import DEFAULT_MAX_TOKENS, DEFAULT_MODEL class Model(BaseModel): @@ -106,7 +106,7 @@ class ChatRequest(BaseModel): temperature: float | None = Field(default=None, le=2.0, ge=0.0) top_p: float | None = Field(default=None, le=1.0, ge=0.0) user: str | None = None # Not used - max_tokens: int | None = 2048 + max_tokens: int | None = DEFAULT_MAX_TOKENS max_completion_tokens: int | None = None reasoning_effort: Literal["low", "medium", "high"] | None = None n: int | None = 1 # Not used diff --git a/src/api/setting.py b/src/api/setting.py index c69780b4..dc2e77e2 100644 --- a/src/api/setting.py +++ b/src/api/setting.py @@ -11,6 +11,7 @@ DEBUG = os.environ.get("DEBUG", "false").lower() != "false" AWS_REGION = os.environ.get("AWS_REGION", "us-west-2") +DEFAULT_MAX_TOKENS = int(os.environ.get("DEFAULT_MAX_TOKENS", "2048")) DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "anthropic.claude-3-sonnet-20240229-v1:0") DEFAULT_EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL", "cohere.embed-multilingual-v3") ENABLE_CROSS_REGION_INFERENCE = os.environ.get("ENABLE_CROSS_REGION_INFERENCE", "true").lower() != "false"