 from app.models.llm import LLMModel
 from app.models.user import User
 from app.services.chat_session_service import ensure_primary_platform_session
-from app.services.history_window import truncate_by_message_count
+from app.services.history_window import truncate_by_token_budget
 from app.services.llm import call_llm, call_llm_with_failover

 router = APIRouter(tags=["websocket"])
@@ -214,7 +214,9 @@ async def websocket_chat( |
 role_description = agent.role_description or ""
 welcome_message = agent.welcome_message or ""
 ctx_size = agent.context_window_size or 100
-logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}")
+from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS
+tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS
+logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok")

 # Load the agent's primary model
 if agent.primary_model_id:
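The tok_budget fallback above relies on a DEFAULT_CONTEXT_WINDOW_TOKENS constant and an optional context_window_tokens attribute on the agent model, neither of which appears in this diff. A minimal sketch of how app/models/agent.py might declare them, assuming SQLAlchemy and an arbitrary 8000-token default (both are assumptions, not the project's actual definitions):

# Sketch only: the real app.models.agent is not shown in this diff, so the
# default value and column shape below are illustrative assumptions.
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

# Fallback budget for agents created before the token-budget column existed.
DEFAULT_CONTEXT_WINDOW_TOKENS = 8000

class Agent(Base):
    __tablename__ = "agents"
    id = Column(Integer, primary_key=True)
    name = Column(String, nullable=False)
    context_window_size = Column(Integer, nullable=True)    # message-count cap (ctx_size)
    context_window_tokens = Column(Integer, nullable=True)  # token budget (tok_budget)

Keeping the column nullable is what lets getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS degrade gracefully for rows created before the column existed.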
@@ -300,11 +302,14 @@ async def websocket_chat( |
 logger.info(f"[WS] Selected primary session {conv_id}")

 try:
+    # Load extra raw material so the app-level token-aware helper
+    # (truncate_by_token_budget below) has room to choose from.
+    _db_load_cap = max(ctx_size, 500)
     history_result = await db.execute(
         select(ChatMessage)
         .where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id)
         .order_by(ChatMessage.created_at.desc())
-        .limit(ctx_size)
+        .limit(_db_load_cap)
     )
     history_messages = list(reversed(history_result.scalars().all()))
     logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}")
@@ -663,12 +668,30 @@ async def _call_with_failover(): |
 async def _on_failover(reason: str):
     await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})

-# Pair-aware truncation: keep the last `ctx_size` messages while
-# preserving assistant.tool_calls ↔ role=tool blocks atomically.
-# Naive [-ctx_size:] slicing can leave orphan tool messages at the
-# head when the cut lands mid-pair, which OpenAI rejects with
-# "No tool call found for function call output" (issue #446).
-_truncated = truncate_by_message_count(conversation, ctx_size)
+# Pair-aware truncation with a token budget plus a message-count
+# safety cap. Either bound stops the walk; pairs (assistant.tool_calls
+# ↔ role=tool) are kept atomic. Token budget protects against
+# one-tool-result-eats-the-window scenarios; message cap protects
+# against pathological tiny-message floods. The pair guard fixes
+# the orphan-tool failure mode reported in #446.
+#
+# The current user message (just appended at line ~416) is excluded
+# from truncation and re-appended after — otherwise a single huge
+# input (large paste, base64 image_data) could push past the budget
+# and cause the helper to drop the very message we're answering.
+# If the input itself exceeds the model's context, the provider will
+# surface a clear error rather than silently dropping it here.
+_current = (
+    conversation[-1]
+    if conversation and conversation[-1].get("role") == "user"
+    else None
+)
+_history = conversation[:-1] if _current is not None else conversation
+_truncated = truncate_by_token_budget(
+    _history, tok_budget, message_cap=ctx_size,
+)
+if _current is not None:
+    _truncated.append(_current)

 return await call_llm_with_failover(
     primary_model=llm_model,
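The helper the new code calls, truncate_by_token_budget in app/services/history_window.py, is not included in this diff. Below is a minimal sketch of what a pair-aware, token-budgeted version could look like; the function body, the rough 4-characters-per-token estimate, and the grouping of a trailing run of role=tool results with the message before them are assumptions for illustration, not the project's actual implementation:

import json


def _estimate_tokens(message: dict) -> int:
    # Crude stand-in for a real tokenizer: roughly 4 characters per token over
    # the serialized message. A production version would use the model's own
    # tokenizer (for example tiktoken) or the provider's counter.
    return max(1, len(json.dumps(message, ensure_ascii=False)) // 4)


def truncate_by_token_budget(messages, token_budget, message_cap=None):
    """Keep the newest messages that fit both bounds, never separating an
    assistant message carrying tool_calls from its trailing role=tool results."""
    kept, used = [], 0
    end = len(messages)
    while end > 0:
        # Group any trailing run of role=tool results with the message that
        # precedes them (normally the assistant.tool_calls message), so the
        # pair is kept or dropped as one unit.
        start = end - 1
        while start > 0 and messages[start].get("role") == "tool":
            start -= 1
        block = messages[start:end]
        block_tokens = sum(_estimate_tokens(m) for m in block)
        over_budget = used + block_tokens > token_budget
        over_cap = message_cap is not None and len(kept) + len(block) > message_cap
        if over_budget or over_cap:
            break  # either bound exceeded: drop this block and everything older
        kept[:0] = block  # prepend, preserving chronological order
        used += block_tokens
        end = start
    return kept

With the call above, truncate_by_token_budget(_history, tok_budget, message_cap=ctx_size), whichever bound is hit first ends the walk, and the freshly appended user message is re-attached afterwards regardless of its size.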