Skip to content

Commit 2a0040f

Browse files
feat(agent): token-aware history truncation alongside message cap
Add `agents.context_window_tokens` field (default 50000) and a new `truncate_by_token_budget` helper that bounds in-context history by both estimated token cost (primary) and message count (safety cap), preserving assistant.tool_calls ↔ role=tool pairs intact via the same walker as truncate_by_message_count. Why: message-count alone is a wildly variable proxy for token cost — one 50KB tool result eats more context than 100 short user messages. Token budget gives predictable behavior across heterogeneous traffic; message cap remains as a safety net against pathological tiny-message floods. Changes: - models/agent.py: + context_window_tokens (Integer, default=50000) + DEFAULT_CONTEXT_WINDOW_TOKENS constant - schemas/schemas.py: AgentOut, AgentUpdate (1000 <= tokens <= 500000) - alembic: add_context_window_tokens.py (idempotent IF NOT EXISTS) - services/history_window.py: + truncate_by_token_budget, refactored common walker, JSON-serialized char->token estimate via existing estimate_tokens_from_chars (chars/3 — overestimates safely) - api/websocket.py: pass tok_budget to helper, raise DB load to max(ctx_size, 500) so helper has room to choose - api/feishu.py: same pattern at 2 sites (web chat + IM channel paths) - frontend: AgentDetail Settings slider + i18n + types 10 new tests covering token-budget mode (huge-message dropped, both-bounds interaction, atomic pair preservation, orphan defense). 25/25 pass. Other channels (dingtalk/discord/slack/teams/wecom/whatsapp) still use DB-level message-count limit only — they don't get token awareness in this PR but won't crash. Migrating them is follow-up scope. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 114be24 commit 2a0040f

11 files changed

Lines changed: 442 additions & 65 deletions

File tree

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
"""add agents.context_window_tokens for token-aware history truncation
2+
3+
Revision ID: add_context_window_tokens
4+
Revises: rm_agent_credential_secrets
5+
Create Date: 2026-04-27
6+
"""
7+
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
12+
13+
# revision identifiers, used by Alembic.
14+
revision: str = "add_context_window_tokens"
15+
down_revision: Union[str, Sequence[str], None] = "rm_agent_credential_secrets"
16+
branch_labels: Union[str, Sequence[str], None] = None
17+
depends_on: Union[str, Sequence[str], None] = None
18+
19+
20+
def upgrade() -> None:
    """Add ``agents.context_window_tokens`` with a DDL default of 50000.

    A four-step sequence is used instead of a single ``ADD COLUMN ...
    DEFAULT 50000`` because ``alembic/versions/0000_initial_schema.py``
    runs ``Base.metadata.create_all(checkfirst=True)`` earlier in the
    chain, creating ``agents`` from the *current* model — new columns
    included. SQLAlchemy's Python-side ``default=`` emits no DDL
    ``DEFAULT``, so that column would be ``NOT NULL`` without a default,
    and a naive ``ADD COLUMN IF NOT EXISTS ... DEFAULT 50000`` would
    short-circuit without ever setting it.

    Idempotent for every starting state:
      - column absent            -> created (nullable, no default yet)
      - column has no default    -> default installed
      - rows with NULL           -> backfilled to 50000
      - finally                  -> NOT NULL enforced

    Safe to re-run: SET DEFAULT to the same value, an UPDATE touching
    zero rows, and SET NOT NULL on an already-NOT-NULL column are all
    no-ops.
    """
    statements = (
        # 1. Create the column when absent — intentionally nullable and
        #    default-free so any pre-existing rows from create_all don't
        #    block the ADD COLUMN.
        "ALTER TABLE agents ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER",
        # 2. Install the DDL default so future inserts that omit the
        #    column (raw SQL, restored backups, manual migrations) get
        #    50000.
        "ALTER TABLE agents ALTER COLUMN context_window_tokens SET DEFAULT 50000",
        # 3. Backfill rows created before the default existed.
        "UPDATE agents SET context_window_tokens = 50000 "
        "WHERE context_window_tokens IS NULL",
        # 4. With no NULLs left, enforcing NOT NULL is safe.
        "ALTER TABLE agents ALTER COLUMN context_window_tokens SET NOT NULL",
    )
    for sql in statements:
        op.execute(sql)
61+
62+
63+
def downgrade() -> None:
    """Intentional no-op: ``context_window_tokens`` is kept on downgrade.

    Dropping the column would discard per-tenant tuning values, so the
    reverse migration deliberately leaves the schema untouched.
    """

backend/app/api/feishu.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from app.models.identity import IdentityProvider
1919
from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
2020
from app.services.feishu_service import feishu_service
21-
from app.services.history_window import truncate_by_message_count
21+
from app.services.history_window import truncate_by_token_budget
2222

2323
router = APIRouter(tags=["feishu"])
2424

@@ -657,11 +657,12 @@ async def process_feishu_event(agent_id: uuid.UUID, body: dict, db: AsyncSession
657657
)
658658
_pre_sess = _pre_sess_r.scalar_one_or_none()
659659
_history_conv_id = str(_pre_sess.id) if _pre_sess else conv_id
660+
# Load extra raw material so app-level token-aware helper has room to choose
660661
history_result = await db.execute(
661662
select(ChatMessage)
662663
.where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == _history_conv_id)
663664
.order_by(ChatMessage.created_at.desc())
664-
.limit(ctx_size)
665+
.limit(max(ctx_size, 500))
665666
)
666667
history_msgs = history_result.scalars().all()
667668
history = _build_llm_history_from_chat_messages(list(reversed(history_msgs)))
@@ -1374,11 +1375,12 @@ async def _handle_feishu_file(
13741375
# Load conversation history for LLM context
13751376
from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
13761377
ctx_size = (agent_obj.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE) if agent_obj else DEFAULT_CONTEXT_WINDOW_SIZE
1378+
# Load extra raw material so app-level token-aware helper has room to choose
13771379
_hist_r = await db.execute(
13781380
_select(ChatMessage)
13791381
.where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == session_conv_id)
13801382
.order_by(ChatMessage.created_at.desc())
1381-
.limit(ctx_size)
1383+
.limit(max(ctx_size, 500))
13821384
)
13831385
_history = _build_llm_history_from_chat_messages(list(reversed(_hist_r.scalars().all())))
13841386

@@ -1632,15 +1634,18 @@ async def _call_agent_llm(
16321634

16331635
# Build conversation messages (without system prompt — call_llm adds it)
16341636
messages: list[dict] = []
1635-
from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
1637+
from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE, DEFAULT_CONTEXT_WINDOW_TOKENS
16361638
ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
1639+
tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS
16371640
if history:
1638-
# Pair-aware truncation preserves any future assistant.tool_calls ↔ role=tool
1639-
# pairs intact. Today _normalize_history_messages drops DB role="tool_call"
1640-
# rows, so this path has no tool messages and the helper acts as plain count
1641-
# truncation; the safety kicks in once a feishu reorganization helper exists.
1641+
# Pair-aware truncation: token budget primary, message count as safety cap.
1642+
# Today _normalize_history_messages drops DB role="tool_call" rows, so this
1643+
# path has no tool messages and the pair guard is a no-op; the safety kicks
1644+
# in once a feishu reorganization helper exists.
16421645
messages.extend(
1643-
truncate_by_message_count(_normalize_history_messages(history), ctx_size)
1646+
truncate_by_token_budget(
1647+
_normalize_history_messages(history), tok_budget, message_cap=ctx_size,
1648+
)
16441649
)
16451650
messages.append({"role": "user", "content": user_text})
16461651

backend/app/api/websocket.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from app.models.llm import LLMModel
2020
from app.models.user import User
2121
from app.services.chat_session_service import ensure_primary_platform_session
22-
from app.services.history_window import truncate_by_message_count
22+
from app.services.history_window import truncate_by_token_budget
2323
from app.services.llm import call_llm, call_llm_with_failover
2424

2525
router = APIRouter(tags=["websocket"])
@@ -214,7 +214,9 @@ async def websocket_chat(
214214
role_description = agent.role_description or ""
215215
welcome_message = agent.welcome_message or ""
216216
ctx_size = agent.context_window_size or 100
217-
logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}")
217+
from app.models.agent import DEFAULT_CONTEXT_WINDOW_TOKENS
218+
tok_budget = getattr(agent, "context_window_tokens", None) or DEFAULT_CONTEXT_WINDOW_TOKENS
219+
logger.info(f"[WS] Agent: {agent_name}, type: {agent_type}, model_id: {agent.primary_model_id}, ctx: {ctx_size}msg/{tok_budget}tok")
218220

219221
# Load the agent's primary model
220222
if agent.primary_model_id:
@@ -300,11 +302,14 @@ async def websocket_chat(
300302
logger.info(f"[WS] Selected primary session {conv_id}")
301303

302304
try:
305+
# Load extra raw material so the app-level token-aware helper
306+
# (truncate_by_token_budget below) has room to choose from.
307+
_db_load_cap = max(ctx_size, 500)
303308
history_result = await db.execute(
304309
select(ChatMessage)
305310
.where(ChatMessage.agent_id == agent_id, ChatMessage.conversation_id == conv_id)
306311
.order_by(ChatMessage.created_at.desc())
307-
.limit(ctx_size)
312+
.limit(_db_load_cap)
308313
)
309314
history_messages = list(reversed(history_result.scalars().all()))
310315
logger.info(f"[WS] Loaded {len(history_messages)} history messages for session {conv_id}")
@@ -663,12 +668,30 @@ async def _call_with_failover():
663668
async def _on_failover(reason: str):
664669
await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})
665670

666-
# Pair-aware truncation: keep the last `ctx_size` messages while
667-
# preserving assistant.tool_calls ↔ role=tool blocks atomically.
668-
# Naive [-ctx_size:] slicing can leave orphan tool messages at the
669-
# head when the cut lands mid-pair, which OpenAI rejects with
670-
# "No tool call found for function call output" (issue #446).
671-
_truncated = truncate_by_message_count(conversation, ctx_size)
671+
# Pair-aware truncation with a token budget plus a message-count
672+
# safety cap. Either bound stops the walk; pairs (assistant.tool_calls
673+
# ↔ role=tool) are kept atomic. Token budget protects against
674+
# one-tool-result-eats-the-window scenarios; message cap protects
675+
# against pathological tiny-message floods. The pair guard fixes
676+
# the orphan-tool failure mode reported in #446.
677+
#
678+
# The current user message (just appended at line ~416) is excluded
679+
# from truncation and re-appended after — otherwise a single huge
680+
# input (large paste, base64 image_data) could push past the budget
681+
# and cause the helper to drop the very message we're answering.
682+
# If the input itself exceeds the model's context, the provider will
683+
# surface a clear error rather than silently dropping it here.
684+
_current = (
685+
conversation[-1]
686+
if conversation and conversation[-1].get("role") == "user"
687+
else None
688+
)
689+
_history = conversation[:-1] if _current is not None else conversation
690+
_truncated = truncate_by_token_budget(
691+
_history, tok_budget, message_cap=ctx_size,
692+
)
693+
if _current is not None:
694+
_truncated.append(_current)
672695

673696
return await call_llm_with_failover(
674697
primary_model=llm_model,

backend/app/models/agent.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
# (see: https://github.com/dataelement/Clawith/issues/238).
1616
DEFAULT_CONTEXT_WINDOW_SIZE = 100
1717

18+
# Default token budget for in-context history. Conservative for 128K-context
19+
# models after system prompt + soul/memory injection (~5-15K tokens). Per-agent
20+
# override via Agent.context_window_tokens.
21+
DEFAULT_CONTEXT_WINDOW_TOKENS = 50000
22+
1823

1924
class Agent(Base):
2025
"""Digital employee (Agent) instance.
@@ -81,6 +86,24 @@ class Agent(Base):
8186
last_monthly_reset: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
8287
tokens_used_total: Mapped[int] = mapped_column(Integer, default=0)
8388
context_window_size: Mapped[int] = mapped_column(Integer, default=100)
89+
# Token-aware secondary bound on history sent to the LLM. Truncation uses
90+
# the smaller of context_window_size (message count) and this token budget,
91+
# preserving assistant.tool_calls ↔ role=tool pairs intact.
92+
#
93+
# ``server_default`` matters: alembic/versions/0000_initial_schema.py uses
94+
# ``Base.metadata.create_all`` which reads model state at runtime. Without
95+
# a server_default, fresh-DB bootstrap would create this column NOT NULL
96+
# without a DDL DEFAULT — and the ``ADD COLUMN IF NOT EXISTS`` migration
97+
# later in the chain would short-circuit, leaving direct-SQL inserts
98+
# broken. ``server_default="50000"`` ensures the DDL has the default
99+
# whether the column was created by create_all or by the explicit
100+
# migration.
101+
context_window_tokens: Mapped[int] = mapped_column(
102+
Integer,
103+
default=DEFAULT_CONTEXT_WINDOW_TOKENS,
104+
server_default=str(DEFAULT_CONTEXT_WINDOW_TOKENS),
105+
nullable=False,
106+
)
84107
max_tool_rounds: Mapped[int] = mapped_column(Integer, default=50)
85108

86109
# Trigger limits (per-agent, configurable from Settings UI)

backend/app/schemas/schemas.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ class AgentOut(BaseModel):
251251
max_tokens_per_day: int | None = None
252252
max_tokens_per_month: int | None = None
253253
context_window_size: int = 100
254+
context_window_tokens: int = 50000
254255
max_tool_rounds: int = 50
255256
max_triggers: int = 20
256257
min_poll_interval_min: int = 5
@@ -286,6 +287,7 @@ class AgentUpdate(BaseModel):
286287
primary_model_id: uuid.UUID | None = None
287288
fallback_model_id: uuid.UUID | None = None
288289
context_window_size: int | None = Field(default=None, ge=1, le=500)
290+
context_window_tokens: int | None = Field(default=None, ge=1000, le=500000)
289291
max_tokens_per_day: int | None = None
290292
max_tokens_per_month: int | None = None
291293
max_tool_rounds: int | None = None

0 commit comments

Comments
 (0)