154 changes: 153 additions & 1 deletion application/agents/base.py
@@ -346,12 +346,81 @@ def _check_context_limit(self, messages: List[Dict]) -> bool:
logger.error(f"Error checking context limit: {str(e)}", exc_info=True)
return False

def _validate_context_size(self, messages: List[Dict]) -> None:
"""
Pre-flight validation before calling LLM. Logs warnings but never raises errors.

Args:
messages: Messages to be sent to LLM
"""
from application.core.model_utils import get_token_limit

current_tokens = self._calculate_current_context_tokens(messages)
self.current_token_count = current_tokens
context_limit = get_token_limit(self.model_id)

percentage = (current_tokens / context_limit) * 100

# Log based on usage level
if current_tokens >= context_limit:
logger.warning(
f"Context at limit: {current_tokens:,}/{context_limit:,} tokens "
f"({percentage:.1f}%). Model: {self.model_id}"
)
elif current_tokens >= int(context_limit * settings.COMPRESSION_THRESHOLD_PERCENTAGE):
logger.info(
f"Context approaching limit: {current_tokens:,}/{context_limit:,} tokens "
f"({percentage:.1f}%)"
)

def _truncate_text_middle(self, text: str, max_tokens: int) -> str:
"""
Truncate text by removing content from the middle, preserving start and end.

Args:
text: Text to truncate
max_tokens: Maximum tokens allowed

Returns:
Truncated text with middle removed if needed
"""
from application.utils import num_tokens_from_string

current_tokens = num_tokens_from_string(text)
if current_tokens <= max_tokens:
return text

# Estimate chars per token (roughly 4 chars per token for English)
chars_per_token = len(text) / current_tokens if current_tokens > 0 else 4
target_chars = int(max_tokens * chars_per_token * 0.95) # 5% safety margin

if target_chars <= 0:
return ""

# Split: keep 40% from start, 40% from end, remove middle
start_chars = int(target_chars * 0.4)
end_chars = int(target_chars * 0.4)

truncation_marker = "\n\n[... content truncated to fit context limit ...]\n\n"

truncated = text[:start_chars] + truncation_marker + text[-end_chars:]

logger.info(
f"Truncated text from {current_tokens:,} to ~{max_tokens:,} tokens "
f"(removed middle section)"
)

return truncated

def _build_messages(
self,
system_prompt: str,
query: str,
) -> List[Dict]:
"""Build messages using pre-rendered system prompt"""
from application.core.model_utils import get_token_limit
from application.utils import num_tokens_from_string

# Append compression summary to system prompt if present
if self.compressed_summary:
compression_context = (
@@ -363,9 +432,34 @@ def _build_messages(
)
system_prompt = system_prompt + compression_context

context_limit = get_token_limit(self.model_id)
system_tokens = num_tokens_from_string(system_prompt)

# Reserve 10% for response/tools
safety_buffer = int(context_limit * 0.1)
available_after_system = context_limit - system_tokens - safety_buffer

# Max tokens for query: 80% of available space (leave room for history)
max_query_tokens = int(available_after_system * 0.8)
query_tokens = num_tokens_from_string(query)

# Truncate query from middle if it exceeds 80% of available context
if query_tokens > max_query_tokens:
query = self._truncate_text_middle(query, max_query_tokens)
query_tokens = num_tokens_from_string(query)

# Calculate remaining budget for chat history
available_for_history = max(available_after_system - query_tokens, 0)

# Truncate chat history to fit within available budget
working_history = self._truncate_history_to_fit(
self.chat_history,
available_for_history,
)

messages = [{"role": "system", "content": system_prompt}]

for i in self.chat_history:
for i in working_history:
if "prompt" in i and "response" in i:
messages.append({"role": "user", "content": i["prompt"]})
messages.append({"role": "assistant", "content": i["response"]})
@@ -397,7 +491,65 @@ def _build_messages(
messages.append({"role": "user", "content": query})
return messages

def _truncate_history_to_fit(
self,
history: List[Dict],
max_tokens: int,
) -> List[Dict]:
"""
Truncate chat history to fit within token budget, keeping most recent messages.

Args:
history: Full chat history
max_tokens: Maximum tokens allowed for history

Returns:
Truncated history (most recent messages that fit)
"""
from application.utils import num_tokens_from_string

if not history or max_tokens <= 0:
return []

truncated = []
current_tokens = 0

# Iterate from newest to oldest
for message in reversed(history):
message_tokens = 0

if "prompt" in message and "response" in message:
message_tokens += num_tokens_from_string(message["prompt"])
message_tokens += num_tokens_from_string(message["response"])

if "tool_calls" in message:
for tool_call in message["tool_calls"]:
tool_str = (
f"Tool: {tool_call.get('tool_name')} | "
f"Action: {tool_call.get('action_name')} | "
f"Args: {tool_call.get('arguments')} | "
f"Response: {tool_call.get('result')}"
)
message_tokens += num_tokens_from_string(tool_str)

if current_tokens + message_tokens <= max_tokens:
current_tokens += message_tokens
truncated.insert(0, message) # Maintain chronological order
else:
break

if len(truncated) < len(history):
logger.info(
f"Truncated chat history from {len(history)} to {len(truncated)} messages "
f"to fit within {max_tokens:,} token budget"
)

return truncated

def _llm_gen(self, messages: List[Dict], log_context: Optional[LogContext] = None):
# Pre-flight context validation - logs a warning if the context is at or approaching the limit (never raises)
self._validate_context_size(messages)

gen_kwargs = {"model": self.model_id, "messages": messages}

if (
4 changes: 4 additions & 0 deletions docs/pages/Guides/_meta.json
@@ -21,6 +21,10 @@
"title": "🏗️ Architecture",
"href": "/Guides/Architecture"
},
"compression": {
"title": "🗜️ Context Compression",
"href": "/Guides/compression"
},
"Integrations": {
"title": "🔗 Integrations"
}
37 changes: 37 additions & 0 deletions docs/pages/Guides/compression.md
@@ -0,0 +1,37 @@
# Context Compression

DocsGPT implements a smart context compression system to manage long conversations effectively. This feature prevents conversations from hitting the LLM's context window limit while preserving critical information and continuity.

## How It Works

The compression system operates on a "summarize and truncate" principle; a simplified sketch of the flow follows the steps below:

1. **Threshold Check**: Before each request, the system calculates the total token count of the conversation history.
2. **Trigger**: If the token count exceeds a configured threshold (default: 80% of the model's context limit), compression is triggered.
3. **Summarization**: An LLM (potentially a different, cheaper/faster one) processes the older part of the conversation—including previous summaries, user messages, agent responses, and tool outputs.
4. **Context Replacement**: The system generates a comprehensive summary of the older history. For subsequent requests, the LLM receives this **Summary + Recent Messages** instead of the full raw history.
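
A minimal sketch of this flow, using hypothetical helper names (`should_compress`, `compress_history`) rather than the actual DocsGPT functions:

```python
# Illustrative sketch only; the names and the "keep last 5 turns" choice are
# assumptions, not the real DocsGPT implementation.

def should_compress(history_tokens: int, context_limit: int, threshold: float = 0.8) -> bool:
    # Trigger once the history uses more than the configured fraction of the
    # model's context window (COMPRESSION_THRESHOLD_PERCENTAGE).
    return history_tokens >= int(context_limit * threshold)


def compress_history(history: list[dict], summarize) -> tuple[str, list[dict]]:
    # Summarize the older part of the conversation and keep the most recent
    # turns verbatim; subsequent requests see "summary + recent messages".
    older, recent = history[:-5], history[-5:]
    summary = summarize(older)  # LLM call, possibly a cheaper override model
    return summary, recent
```

In the actual system the threshold check and the summarization are handled by the components described under Architecture below.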

### Key Features

* **Recursive Summarization**: New summaries incorporate previous summaries, ensuring that information from the very beginning of a long chat is not lost; the sketch after this list illustrates the idea.
* **Tool Call Support**: The compression logic explicitly handles tool calls and their outputs (e.g., file readings, search results), summarizing their results so the agent retains knowledge of what it has already done.
* **"Needle in a Haystack" Preservation**: The prompts are designed to identify and preserve specific, critical details (like passwords, keys, or specific user instructions) even when compressing large amounts of text.

## Configuration

You can configure the compression behavior in your `.env` file or `application/core/settings.py`:

| Setting | Default | Description |
| :--- | :--- | :--- |
| `ENABLE_CONVERSATION_COMPRESSION` | `True` | Master switch to enable/disable the feature. |
| `COMPRESSION_THRESHOLD_PERCENTAGE` | `0.8` | The fraction of the context window (0.0 to 1.0) that triggers compression. |
| `COMPRESSION_MODEL_OVERRIDE` | `None` | (Optional) Specify a different model ID to use specifically for the summarization task (e.g., using `gpt-3.5-turbo` to compress for `gpt-4`). |
| `COMPRESSION_MAX_HISTORY_POINTS` | `3` | The number of past compression points to keep in the database (older ones are discarded as they are incorporated into newer summaries). |
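
For example, a `.env` that keeps compression enabled but delegates summarization to a cheaper model might contain the following (values are illustrative):

```
ENABLE_CONVERSATION_COMPRESSION=True
COMPRESSION_THRESHOLD_PERCENTAGE=0.8
COMPRESSION_MODEL_OVERRIDE=gpt-3.5-turbo
COMPRESSION_MAX_HISTORY_POINTS=3
```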

## Architecture

The system is modularized into several components; a rough sketch of how they cooperate follows the list:

* **`CompressionThresholdChecker`**: Calculates token usage and decides when to compress.
* **`CompressionService`**: Orchestrates the compression process, manages DB updates, and reconstructs the context (Summary + Recent Messages) for the LLM.
* **`CompressionPromptBuilder`**: Constructs the specific prompts used to instruct the LLM to summarize the conversation effectively.
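
A highly simplified, hypothetical view of how these components might cooperate on a single request (the method names below are illustrative, not the real interfaces):

```python
def prepare_llm_context(history: list[dict], model_id: str, checker, service) -> list[dict]:
    # CompressionThresholdChecker decides whether usage has crossed the threshold.
    if checker.should_compress(history, model_id):
        # CompressionService summarizes older turns (building its prompts via
        # CompressionPromptBuilder) and stores the new compression point.
        service.compress(history)
    # The reconstructed context handed to the LLM: summary + recent raw messages.
    return service.build_context(history)
```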