154 changes: 153 additions & 1 deletion application/agents/base.py
@@ -346,12 +346,81 @@ def _check_context_limit(self, messages: List[Dict]) -> bool:
logger.error(f"Error checking context limit: {str(e)}", exc_info=True)
return False

def _validate_context_size(self, messages: List[Dict]) -> None:
"""
Pre-flight validation before calling LLM. Logs warnings but never raises errors.

Args:
messages: Messages to be sent to LLM
"""
from application.core.model_utils import get_token_limit

current_tokens = self._calculate_current_context_tokens(messages)
self.current_token_count = current_tokens
context_limit = get_token_limit(self.model_id)

percentage = (current_tokens / context_limit) * 100

# Log based on usage level
if current_tokens >= context_limit:
logger.warning(
f"Context at limit: {current_tokens:,}/{context_limit:,} tokens "
f"({percentage:.1f}%). Model: {self.model_id}"
)
elif current_tokens >= int(context_limit * settings.COMPRESSION_THRESHOLD_PERCENTAGE):
logger.info(
f"Context approaching limit: {current_tokens:,}/{context_limit:,} tokens "
f"({percentage:.1f}%)"
)

def _truncate_text_middle(self, text: str, max_tokens: int) -> str:
"""
Truncate text by removing content from the middle, preserving start and end.

Args:
text: Text to truncate
max_tokens: Maximum tokens allowed

Returns:
Truncated text with middle removed if needed
"""
from application.utils import num_tokens_from_string

current_tokens = num_tokens_from_string(text)
if current_tokens <= max_tokens:
return text

# Estimate chars per token (roughly 4 chars per token for English)
chars_per_token = len(text) / current_tokens if current_tokens > 0 else 4
target_chars = int(max_tokens * chars_per_token * 0.95) # 5% safety margin

if target_chars <= 0:
return ""

# Split: keep 40% from start, 40% from end, remove middle
start_chars = int(target_chars * 0.4)
end_chars = int(target_chars * 0.4)

truncation_marker = "\n\n[... content truncated to fit context limit ...]\n\n"

truncated = text[:start_chars] + truncation_marker + text[-end_chars:]

logger.info(
f"Truncated text from {current_tokens:,} to ~{max_tokens:,} tokens "
f"(removed middle section)"
)

return truncated

def _build_messages(
self,
system_prompt: str,
query: str,
) -> List[Dict]:
"""Build messages using pre-rendered system prompt"""
from application.core.model_utils import get_token_limit
from application.utils import num_tokens_from_string

# Append compression summary to system prompt if present
if self.compressed_summary:
compression_context = (
@@ -363,9 +432,34 @@ def _build_messages(
)
system_prompt = system_prompt + compression_context

context_limit = get_token_limit(self.model_id)
system_tokens = num_tokens_from_string(system_prompt)

# Reserve 10% for response/tools
safety_buffer = int(context_limit * 0.1)
available_after_system = context_limit - system_tokens - safety_buffer

# Max tokens for query: 80% of available space (leave room for history)
max_query_tokens = int(available_after_system * 0.8)
query_tokens = num_tokens_from_string(query)

# Truncate query from middle if it exceeds 80% of available context
if query_tokens > max_query_tokens:
query = self._truncate_text_middle(query, max_query_tokens)
query_tokens = num_tokens_from_string(query)

# Calculate remaining budget for chat history
available_for_history = max(available_after_system - query_tokens, 0)

# Truncate chat history to fit within available budget
working_history = self._truncate_history_to_fit(
self.chat_history,
available_for_history,
)

messages = [{"role": "system", "content": system_prompt}]

for i in self.chat_history:
for i in working_history:
if "prompt" in i and "response" in i:
messages.append({"role": "user", "content": i["prompt"]})
messages.append({"role": "assistant", "content": i["response"]})
@@ -397,7 +491,65 @@ def _build_messages(
messages.append({"role": "user", "content": query})
return messages

def _truncate_history_to_fit(
self,
history: List[Dict],
max_tokens: int,
) -> List[Dict]:
"""
Truncate chat history to fit within token budget, keeping most recent messages.

Args:
history: Full chat history
max_tokens: Maximum tokens allowed for history

Returns:
Truncated history (most recent messages that fit)
"""
from application.utils import num_tokens_from_string

if not history or max_tokens <= 0:
return []

truncated = []
current_tokens = 0

# Iterate from newest to oldest
for message in reversed(history):
message_tokens = 0

if "prompt" in message and "response" in message:
message_tokens += num_tokens_from_string(message["prompt"])
message_tokens += num_tokens_from_string(message["response"])

if "tool_calls" in message:
for tool_call in message["tool_calls"]:
tool_str = (
f"Tool: {tool_call.get('tool_name')} | "
f"Action: {tool_call.get('action_name')} | "
f"Args: {tool_call.get('arguments')} | "
f"Response: {tool_call.get('result')}"
)
message_tokens += num_tokens_from_string(tool_str)

if current_tokens + message_tokens <= max_tokens:
current_tokens += message_tokens
truncated.insert(0, message) # Maintain chronological order
else:
break

if len(truncated) < len(history):
logger.info(
f"Truncated chat history from {len(history)} to {len(truncated)} messages "
f"to fit within {max_tokens:,} token budget"
)

return truncated

def _llm_gen(self, messages: List[Dict], log_context: Optional[LogContext] = None):
# Pre-flight context validation - logs a warning if the context is at or approaching the limit (never raises)
self._validate_context_size(messages)

gen_kwargs = {"model": self.model_id, "messages": messages}

if (
4 changes: 4 additions & 0 deletions docs/pages/Guides/_meta.json
@@ -21,6 +21,10 @@
"title": "🏗️ Architecture",
"href": "/Guides/Architecture"
},
"compression": {
"title": "🗜️ Context Compression",
"href": "/Guides/compression"
},
"Integrations": {
"title": "🔗 Integrations"
}
37 changes: 37 additions & 0 deletions docs/pages/Guides/compression.md
@@ -0,0 +1,37 @@
# Context Compression

DocsGPT implements a smart context compression system to manage long conversations effectively. This feature prevents conversations from hitting the LLM's context window limit while preserving critical information and continuity.

## How It Works

The compression system operates on a "summarize and truncate" principle; a simplified sketch of the flow follows the steps below:

1. **Threshold Check**: Before each request, the system calculates the total token count of the conversation history.
2. **Trigger**: If the token count exceeds a configured threshold (default: 80% of the model's context limit), compression is triggered.
3. **Summarization**: An LLM (potentially a different, cheaper/faster one) processes the older part of the conversation—including previous summaries, user messages, agent responses, and tool outputs.
4. **Context Replacement**: The system generates a comprehensive summary of the older history. For subsequent requests, the LLM receives this **Summary + Recent Messages** instead of the full raw history.
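
A minimal sketch of this flow, using hypothetical helper names (`should_compress`, `compress_history`) rather than the actual DocsGPT functions:

```python
# Illustrative sketch only; the names and the "keep last 5 turns" choice are
# assumptions, not the real DocsGPT implementation.

def should_compress(history_tokens: int, context_limit: int, threshold: float = 0.8) -> bool:
    # Trigger once the history uses more than the configured fraction of the
    # model's context window (COMPRESSION_THRESHOLD_PERCENTAGE).
    return history_tokens >= int(context_limit * threshold)


def compress_history(history: list[dict], summarize) -> tuple[str, list[dict]]:
    # Summarize the older part of the conversation and keep the most recent
    # turns verbatim; subsequent requests see "summary + recent messages".
    older, recent = history[:-5], history[-5:]
    summary = summarize(older)  # LLM call, possibly a cheaper override model
    return summary, recent
```

In the actual system the threshold check and the summarization are handled by the components described under Architecture below.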

### Key Features

* **Recursive Summarization**: New summaries incorporate previous summaries, ensuring that information from the very beginning of a long chat is not lost; the sketch after this list illustrates the idea.
* **Tool Call Support**: The compression logic explicitly handles tool calls and their outputs (e.g., file readings, search results), summarizing their results so the agent retains knowledge of what it has already done.
* **"Needle in a Haystack" Preservation**: The prompts are designed to identify and preserve specific, critical details (like passwords, keys, or specific user instructions) even when compressing large amounts of text.

## Configuration

You can configure the compression behavior in your `.env` file or `application/core/settings.py`:

| Setting | Default | Description |
| :--- | :--- | :--- |
| `ENABLE_CONVERSATION_COMPRESSION` | `True` | Master switch to enable/disable the feature. |
| `COMPRESSION_THRESHOLD_PERCENTAGE` | `0.8` | The fraction of the context window (0.0 to 1.0) that triggers compression. |
| `COMPRESSION_MODEL_OVERRIDE` | `None` | (Optional) Specify a different model ID to use specifically for the summarization task (e.g., using `gpt-3.5-turbo` to compress for `gpt-4`). |
| `COMPRESSION_MAX_HISTORY_POINTS` | `3` | The number of past compression points to keep in the database (older ones are discarded as they are incorporated into newer summaries). |
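
For example, a `.env` that keeps compression enabled but delegates summarization to a cheaper model might contain the following (values are illustrative):

```
ENABLE_CONVERSATION_COMPRESSION=True
COMPRESSION_THRESHOLD_PERCENTAGE=0.8
COMPRESSION_MODEL_OVERRIDE=gpt-3.5-turbo
COMPRESSION_MAX_HISTORY_POINTS=3
```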

## Architecture

The system is modularized into several components; a rough sketch of how they cooperate follows the list:

* **`CompressionThresholdChecker`**: Calculates token usage and decides when to compress.
* **`CompressionService`**: Orchestrates the compression process, manages DB updates, and reconstructs the context (Summary + Recent Messages) for the LLM.
* **`CompressionPromptBuilder`**: Constructs the specific prompts used to instruct the LLM to summarize the conversation effectively.
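
A highly simplified, hypothetical view of how these components might cooperate on a single request (the method names below are illustrative, not the real interfaces):

```python
def prepare_llm_context(history: list[dict], model_id: str, checker, service) -> list[dict]:
    # CompressionThresholdChecker decides whether usage has crossed the threshold.
    if checker.should_compress(history, model_id):
        # CompressionService summarizes older turns (building its prompts via
        # CompressionPromptBuilder) and stores the new compression point.
        service.compress(history)
    # The reconstructed context handed to the LLM: summary + recent raw messages.
    return service.build_context(history)
```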