chopratejas · chopratejas · May 8, 2026
diff --git a/headroom/config.py b/headroom/config.py
@@ -207,7 +207,8 @@ class AnchorConfig:
 # Tool outputs that are reference data and must NOT be compressed.
 # Read/Glob/Grep contain exact file contents/search results the agent needs for edits.
 # Write/Edit record what changes were made — compressing them causes duplicate/conflicting edits.
-# Bash is NOT excluded — its outputs (build logs, test output) are ideal compression targets.
+# Bash IS excluded by design: RTK (Rust Token Killer) handles Bash output
+# compression upstream of headroom. Compressing here would double-compress.
 DEFAULT_EXCLUDE_TOOLS: frozenset[str] = frozenset(
     {
         "Read",

diff --git a/headroom/transforms/content_router.py b/headroom/transforms/content_router.py
@@ -1243,20 +1243,34 @@ def eager_load_compressors(self) -> dict[str, str]:
         status: dict[str, str] = {}
 
         # 1. ML text compressor: Kompress
+        # Wrapper construction alone does NOT load the ONNX session — that
+        # happens lazily inside compress(). Run a tiny dummy compress() so
+        # _load_kompress() runs AND the ONNX graph is optimized at startup,
+        # not on the first real request (was costing ~3-9s of opt_ms).
         if self.config.enable_kompress:
             compressor = self._get_kompress()
             if compressor:
-                logger.info("Kompress model pre-loaded at startup")
+                try:
+                    compressor.compress("warmup " * 16)
+                    logger.info("Kompress model pre-loaded at startup")
+                except Exception as exc:
+                    logger.warning("Kompress warmup forward pass failed: %s", exc)
                 status["kompress"] = "enabled"
             else:
                 status["kompress"] = "unavailable"
 
         # 2. Magika content detector (avoids 100-200ms on first content detection)
+        # Singleton init alone defers the model's first inference cost; run a
+        # dummy identify_bytes so the predictor is fully warm.
         try:
             from ..compression.detector import _get_magika, _magika_available
 
             if _magika_available():
-                _get_magika()  # Initializes the singleton
+                _magika_inst = _get_magika()
+                try:
+                    _magika_inst.identify_bytes(b"warmup")
+                except Exception as exc:
+                    logger.debug("Magika warmup inference failed: %s", exc)
                 logger.info("Magika content detector pre-loaded at startup")
                 status["magika"] = "enabled"
             else:
@@ -1299,6 +1313,15 @@ def eager_load_compressors(self) -> dict[str, str]:
                 except Exception as e:
                     logger.debug("Tree-sitter pre-load skipped: %s", e)
                     status["tree_sitter"] = "skipped"
+                # Force one parse + compress on the most common language so
+                # the AST visitor JIT paths are hot before request traffic.
+                try:
+                    code_compressor.compress(
+                        "def warmup():\n    return 1\n",
+                        language="python",
+                    )
+                except Exception as exc:
+                    logger.debug("Code-Aware warmup compress failed: %s", exc)
             else:
                 status["code_aware"] = "not installed"