feat: Phase 1 — Transform Pipeline + SnapBack reversible compression

aeromomo · claude · aeromomo · commit b2687b71e8c9 · 2026-03-17T18:34:08.000-07:00
- Transform base class + Pipeline engine (ordered chain, immutable context)
- SnapBack store (LRU + TTL), marker embed/extract, provider-agnostic tool defs
- 107 new tests (43 pipeline + 64 snapback), 1026 total passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/EVOLUTION_PLAN.md b/EVOLUTION_PLAN.md
diff --git a/scripts/lib/snapback/__init__.py b/scripts/lib/snapback/__init__.py
@@ -0,0 +1,16 @@
+"""SnapBack reversible compression engine for Claw Compactor v7.0.
+
+Part of claw-compactor. License: MIT.
+"""
+from .store import SnapBackStore
+from .marker import embed_marker, extract_markers, has_markers
+from .retriever import snapback_tool_def, handle_retrieval
+
+__all__ = [  # public names re-exported from .store, .marker and .retriever
+    "SnapBackStore",
+    "embed_marker",
+    "extract_markers",
+    "has_markers",
+    "snapback_tool_def",
+    "handle_retrieval",
+]
diff --git a/scripts/lib/snapback/marker.py b/scripts/lib/snapback/marker.py
@@ -0,0 +1,50 @@
+"""SnapBack markers: embed/extract hash references in compressed text.
+
+Part of claw-compactor. License: MIT.
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass
+
+
+MARKER_PATTERN = re.compile(  # groups: (1) original count, (2) compressed count, (3) 24-char lowercase hex hash
+    r'\[(\d+) items? compressed to (\d+)\. Retrieve: hash=([a-f0-9]{24})\]'
+)
+
+
+@dataclass(frozen=True)
+class MarkerInfo:  # one parsed SnapBack marker, as built by extract_markers()
+    original_count: int  # first number in the marker ("N items")
+    compressed_count: int  # second number in the marker ("compressed to M")
+    hash_id: str  # 24-char hex ID captured after "hash="
+    span: tuple[int, int]  # (start, end) in text
+
+
+def embed_marker(text: str, original_count: int, compressed_count: int, hash_id: str) -> str:
+    """Return *text* followed by a newline and a bracketed retrieval marker."""
+    noun = "items" if original_count != 1 else "item"
+    suffix = f"[{original_count} {noun} compressed to {compressed_count}. Retrieve: hash={hash_id}]"
+    return "\n".join((text, suffix))
+
+
+def extract_markers(text: str) -> list[MarkerInfo]:
+    """Parse every SnapBack marker found in *text*, in order of appearance.
+
+    Each MarkerInfo carries the two counts, the hash ID and the match span.
+    """
+    return [
+        MarkerInfo(original_count=int(found.group(1)),
+                   compressed_count=int(found.group(2)),
+                   hash_id=found.group(3), span=found.span())
+        for found in MARKER_PATTERN.finditer(text)
+    ]
+
+
+def has_markers(text: str) -> bool:
+    """Report whether at least one SnapBack marker occurs anywhere in *text*."""
+    return MARKER_PATTERN.search(text) is not None
+
+
+def strip_markers(text: str) -> str:
+    """Delete every SnapBack marker from *text*, then trim trailing whitespace."""
+    return re.sub(MARKER_PATTERN, "", text).rstrip()
diff --git a/scripts/lib/snapback/retriever.py b/scripts/lib/snapback/retriever.py
@@ -0,0 +1,75 @@
+"""SnapBack retriever: generates tool definitions and handles retrieval calls.
+
+Part of claw-compactor. License: MIT.
+"""
+from __future__ import annotations
+from typing import Any
+from .store import SnapBackStore
+
+
+TOOL_NAME = "snapback_retrieve"  # tool/function name emitted in both provider formats
+TOOL_DESCRIPTION = (  # description text placed in the generated tool definition
+    "Retrieve the original uncompressed content for a compressed section. "
+    "Use this when you need more detail from a section marked with a retrieval hash."
+)
+
+
+def snapback_tool_def(provider: str = "openai") -> dict[str, Any]:
+    """Build the snapback_retrieve tool definition in a provider's format.
+
+    "anthropic" yields a flat dict with an "input_schema"; any other value
+    falls back to the OpenAI shape, nested under a "function" key.
+    """
+    hash_prop = {
+        "type": "string",
+        "description": "The 24-character hash ID from the compression marker.",
+    }
+    keywords_prop = {
+        "type": "array",
+        "items": {"type": "string"},
+        "description": "Optional keywords to filter the retrieved content.",
+    }
+    schema = {
+        "type": "object",
+        "properties": {"hash_id": hash_prop, "keywords": keywords_prop},
+        "required": ["hash_id"],
+    }
+    if provider != "anthropic":
+        # OpenAI-style wrapper doubles as the default for unknown providers.
+        function_spec = {
+            "name": TOOL_NAME,
+            "description": TOOL_DESCRIPTION,
+            "parameters": schema,
+        }
+        return {"type": "function", "function": function_spec}
+    return {
+        "name": TOOL_NAME,
+        "description": TOOL_DESCRIPTION,
+        "input_schema": schema,
+    }
+
+
+def handle_retrieval(store: SnapBackStore, tool_call: dict[str, Any]) -> dict[str, Any]:
+    """Process a snapback_retrieve tool call and return a status dict.
+
+    Arguments come from "arguments", "input", or an OpenAI-style nested
+    "function" -> "arguments"; JSON-string payloads are decoded first.
+    """
+    import json
+    args = tool_call.get("arguments", tool_call.get("input"))
+    nested = tool_call.get("function")
+    if args is None and isinstance(nested, dict):  # raw OpenAI tool-call shape
+        args = nested.get("arguments")
+    if isinstance(args, str):
+        args = json.loads(args)
+    if not isinstance(args, dict):  # None / non-object payload: nothing to look up
+        args = {}
+    hash_id = args.get("hash_id", "")
+    keywords = args.get("keywords") or []
+    if isinstance(keywords, str):  # a bare string would be matched char by char
+        keywords = [keywords]
+    content = store.search(hash_id, keywords) if keywords else store.retrieve(hash_id)
+    if content is None:
+        return {"status": "not_found",
+                "message": f"No content found for hash={hash_id}. It may have expired."}
+    return {"status": "ok", "content": content}
diff --git a/scripts/lib/snapback/store.py b/scripts/lib/snapback/store.py
@@ -0,0 +1,77 @@
+"""SnapBack store: LRU cache for original text, enabling reversible compression.
+
+Part of claw-compactor. License: MIT.
+"""
+from __future__ import annotations
+import hashlib
+import time
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class CacheEntry:  # immutable record kept per hash ID by SnapBackStore
+    original: str  # full uncompressed text; this is what retrieve() returns
+    compressed: str  # compressed counterpart supplied by the caller of store()
+    stored_at: float  # time.monotonic() reading taken at store time; compared against the TTL
+    original_tokens: int  # caller-supplied count; SnapBackStore.store() defaults it to 0
+    compressed_tokens: int  # caller-supplied count; SnapBackStore.store() defaults it to 0
+
+class SnapBackStore:
+    """Bounded, time-limited LRU map from hash IDs to the original text."""
+
+    def __init__(self, max_entries: int = 500, ttl_seconds: int = 600):
+        self.max_entries = max_entries
+        self.ttl_seconds = ttl_seconds
+        self._cache: OrderedDict[str, CacheEntry] = OrderedDict()
+
+    def store(self, original: str, compressed: str,
+              original_tokens: int = 0, compressed_tokens: int = 0) -> str:
+        """Cache *original* under the first 24 hex chars of its SHA-256 digest.
+
+        Re-storing identical text replaces the entry and refreshes its
+        timestamp; the least recently used entries are evicted beyond
+        max_entries. Returns the hash ID.
+        """
+        digest = hashlib.sha256(original.encode("utf-8")).hexdigest()
+        key = digest[:24]
+        # Drop any existing entry first so the new one lands at the MRU end.
+        self._cache.pop(key, None)
+        self._cache[key] = CacheEntry(
+            original, compressed, time.monotonic(),
+            original_tokens, compressed_tokens,
+        )
+        while len(self._cache) > self.max_entries:
+            self._cache.popitem(last=False)  # front of the dict is the LRU entry
+        return key
+
+    def retrieve(self, hash_id: str) -> Optional[str]:
+        """Return the original text for *hash_id*, or None if absent or expired."""
+        hit = self._cache.get(hash_id)
+        if hit is not None and time.monotonic() - hit.stored_at <= self.ttl_seconds:
+            # Fresh hit: mark it as the most recently used entry.
+            self._cache.move_to_end(hash_id)
+            return hit.original
+        # Miss or expired: drop any stale entry (no-op when the key is absent).
+        self._cache.pop(hash_id, None)
+        return None
+
+    def search(self, hash_id: str, keywords: list[str]) -> Optional[str]:
+        """Return the stored lines containing any keyword (case-insensitive).
+
+        Falls back to the whole text when nothing matches; None if unavailable.
+        """
+        text = self.retrieve(hash_id)
+        if text is None or not keywords:
+            return text
+        hits = [ln for ln in text.split("\n") if any(kw.lower() in ln.lower() for kw in keywords)]
+        return "\n".join(hits) if hits else text
+
+    @property
+    def size(self) -> int:  # number of entries currently held, expired or not
+        return len(self._cache)
+
+    def clear(self) -> None:  # forget every stored entry
+        self._cache.clear()
diff --git a/scripts/lib/transforms/__init__.py b/scripts/lib/transforms/__init__.py
@@ -0,0 +1,14 @@
+"""Transform Pipeline framework for Claw Compactor v7.0.
+
+Part of claw-compactor. License: MIT.
+"""
+from lib.transforms.base import Transform, CompressContext, TransformResult
+from lib.transforms.pipeline import Pipeline, PipelineResult
+
+__all__ = [  # public names re-exported from lib.transforms.base and lib.transforms.pipeline
+    "Transform",
+    "Pipeline",
+    "CompressContext",
+    "TransformResult",
+    "PipelineResult",
+]
diff --git a/scripts/lib/transforms/base.py b/scripts/lib/transforms/base.py
@@ -0,0 +1,71 @@
+"""Transform base classes for Claw Compactor pipeline.
+
+Part of claw-compactor. License: MIT.
+"""
+from __future__ import annotations
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field, replace
+from typing import Any
+
+
+@dataclass(frozen=True)
+class CompressContext:
+    """Immutable context passed through the transform pipeline."""
+    content: str  # text to compress; Pipeline.run replaces it with each applied transform's output
+    content_type: str = "text"  # text|code|json|log|diff|search
+    language: str | None = None  # NOTE(review): presumably the source language of code content — confirm
+    role: str = "user"  # system|user|assistant|tool
+    model: str | None = None  # NOTE(review): presumably the target model name — confirm
+    token_budget: int | None = None  # NOTE(review): presumably a token limit for the output — confirm
+    query: str | None = None  # NOTE(review): presumably a query used to judge relevance — confirm
+    metadata: dict = field(default_factory=dict)  # free-form extras; the dict itself stays mutable despite frozen=True
+
+    def evolve(self, **kwargs) -> CompressContext:
+        """Return a new context with specified fields replaced."""
+        return replace(self, **kwargs)  # dataclasses.replace builds a new instance; self is unchanged
+
+
+@dataclass(frozen=True)
+class TransformResult:
+    """Immutable result from a single transform."""
+    content: str  # transform output; Transform.timed_apply echoes the input here when skipping
+    original_tokens: int = 0  # reported by the transform; logged by Pipeline.run
+    compressed_tokens: int = 0  # reported by the transform; logged by Pipeline.run
+    markers: list[str] = field(default_factory=list)  # gathered into PipelineResult.markers
+    warnings: list[str] = field(default_factory=list)  # gathered into PipelineResult.warnings
+    timing_ms: float = 0.0  # filled in by Transform.timed_apply (milliseconds)
+    skipped: bool = False  # True when should_apply() returned False
+
+
+class Transform(ABC):
+    """Abstract compression step; subclasses set name/order and implement both hooks."""
+    name: str = "unnamed"
+    order: int = 50  # sort key used by Pipeline; smaller values run first
+
+    @abstractmethod
+    def should_apply(self, ctx: CompressContext) -> bool:
+        """Decide whether apply() should run for the given context."""
+        ...
+
+    @abstractmethod
+    def apply(self, ctx: CompressContext) -> TransformResult:
+        """Transform ctx.content and report the outcome as a TransformResult."""
+        ...
+
+    def timed_apply(self, ctx: CompressContext) -> TransformResult:
+        """Run apply() under a monotonic timer, or report a skip.
+
+        When should_apply() is false the input content is echoed back with
+        skipped=True; otherwise apply()'s result is copied with timing_ms set.
+        """
+        if self.should_apply(ctx):
+            began = time.monotonic()
+            outcome = self.apply(ctx)
+            elapsed_ms = (time.monotonic() - began) * 1000
+            # Rebuild rather than mutate: TransformResult is frozen.
+            fields = {attr: getattr(outcome, attr) for attr in
+                      ("content", "original_tokens", "compressed_tokens",
+                       "markers", "warnings")}
+            return TransformResult(timing_ms=elapsed_ms, skipped=False, **fields)
+        return TransformResult(content=ctx.content, skipped=True)
diff --git a/scripts/lib/transforms/pipeline.py b/scripts/lib/transforms/pipeline.py
@@ -0,0 +1,85 @@
+"""Pipeline engine: runs a chain of Transforms sequentially.
+
+Part of claw-compactor. License: MIT.
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass, field
+from lib.transforms.base import Transform, CompressContext, TransformResult
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class StepResult:
+    """Result from a single pipeline step."""
+    transform_name: str  # the transform's ``name`` attribute, recorded by Pipeline.run
+    result: TransformResult  # what timed_apply() returned for that transform (skipped or not)
+
+
+@dataclass(frozen=True)
+class PipelineResult:
+    """Aggregated result from running all transforms."""
+    content: str  # content of the context after the last transform
+    steps: list[StepResult] = field(default_factory=list)  # one entry per transform, skipped ones included
+    total_timing_ms: float = 0.0  # sum of every step's timing_ms
+    markers: list[str] = field(default_factory=list)  # markers from the transforms that were not skipped
+    warnings: list[str] = field(default_factory=list)  # warnings from the transforms that were not skipped
+
+
+class Pipeline:
+    """Runs Transforms one after another, lowest ``order`` first."""
+
+    def __init__(self, transforms: list[Transform] | None = None):
+        # sorted() is stable, so transforms sharing an order keep their given sequence.
+        ordered = sorted(transforms or [], key=lambda item: item.order)
+        self._transforms: list[Transform] = ordered
+
+    def add(self, transform: Transform) -> Pipeline:
+        """Build a new Pipeline that also contains *transform*; this one is not modified."""
+        combined = list(self._transforms)
+        combined.append(transform)
+        combined.sort(key=lambda item: item.order)
+        return Pipeline(combined)
+
+    @property
+    def transforms(self) -> list[Transform]:
+        return self._transforms[:]  # a copy, so callers cannot reorder the pipeline
+
+    def run(self, ctx: CompressContext) -> PipelineResult:
+        """Feed *ctx* through every transform, chaining each output into the next.
+
+        Skipped transforms are still recorded in ``steps`` but leave the
+        content untouched. Markers and warnings are gathered from the
+        transforms that ran; timings are summed over all steps.
+        """
+        history: list[StepResult] = []
+        collected_markers: list[str] = []
+        collected_warnings: list[str] = []
+        elapsed_ms = 0.0
+        working = ctx
+
+        for step in self._transforms:
+            outcome = step.timed_apply(working)
+            history.append(StepResult(transform_name=step.name, result=outcome))
+            elapsed_ms += outcome.timing_ms
+
+            if outcome.skipped:
+                logger.debug("%s: skipped", step.name)
+                continue
+            working = working.evolve(content=outcome.content)
+            collected_markers += outcome.markers
+            collected_warnings += outcome.warnings
+            logger.debug(
+                "%s: %d→%d tokens (%.1fms)",
+                step.name, outcome.original_tokens,
+                outcome.compressed_tokens, outcome.timing_ms,
+            )
+
+        return PipelineResult(
+            content=working.content,
+            steps=history,
+            total_timing_ms=elapsed_ms,
+            markers=collected_markers,
+            warnings=collected_warnings,
+        )
diff --git a/tests/test_snapback.py b/tests/test_snapback.py
diff --git a/tests/test_transform_pipeline.py b/tests/test_transform_pipeline.py