Skip to content

Commit b2687b7

Browse files
aeromomo and claude committed
feat: Phase 1 — Transform Pipeline + SnapBack reversible compression
- Transform base class + Pipeline engine (ordered chain, immutable context)
- SnapBack store (LRU + TTL), marker embed/extract, provider-agnostic tool defs
- 107 new tests (43 pipeline + 64 snapback), 1026 total passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ddfe09c commit b2687b7

10 files changed

Lines changed: 2170 additions & 0 deletions

File tree

EVOLUTION_PLAN.md

Lines changed: 711 additions & 0 deletions
Large diffs are not rendered by default.

scripts/lib/snapback/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""SnapBack reversible compression engine for Claw Compactor v7.0.
2+
3+
Part of claw-compactor. License: MIT.
4+
"""
5+
from .store import SnapBackStore
6+
from .marker import embed_marker, extract_markers, has_markers
7+
from .retriever import snapback_tool_def, handle_retrieval
8+
9+
__all__ = [
10+
"SnapBackStore",
11+
"embed_marker",
12+
"extract_markers",
13+
"has_markers",
14+
"snapback_tool_def",
15+
"handle_retrieval",
16+
]

scripts/lib/snapback/marker.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""SnapBack markers: embed/extract hash references in compressed text.
2+
3+
Part of claw-compactor. License: MIT.
4+
"""
5+
from __future__ import annotations
6+
import re
7+
from dataclasses import dataclass
8+
9+
10+
# Matches "[<N> item(s) compressed to <M>. Retrieve: hash=<24 lowercase hex chars>]".
# Capture groups: 1 = original item count, 2 = compressed item count, 3 = hash ID.
MARKER_PATTERN = re.compile(r'\[(\d+) items? compressed to (\d+)\. Retrieve: hash=([a-f0-9]{24})\]')
13+
14+
15+
@dataclass(frozen=True)
class MarkerInfo:
    """Frozen record describing one SnapBack marker found in a text."""

    # First number in the marker: item count before compression.
    original_count: int
    # Second number in the marker: item count after compression.
    compressed_count: int
    # Hash ID carried by the marker.
    hash_id: str
    # (start, end) offsets of the marker within the text.
    span: tuple[int, int]
21+
22+
23+
def embed_marker(text: str, original_count: int, compressed_count: int, hash_id: str) -> str:
    """Return ``text`` followed by a newline and a SnapBack retrieval marker.

    The marker reads ``[<original_count> item(s) compressed to
    <compressed_count>. Retrieve: hash=<hash_id>]``. The singular "item" is
    used only when ``original_count`` is exactly 1.
    """
    noun = "items" if original_count != 1 else "item"
    suffix = (
        f"[{original_count} {noun} compressed to {compressed_count}. "
        f"Retrieve: hash={hash_id}]"
    )
    return "\n".join((text, suffix))
28+
29+
30+
def extract_markers(text: str) -> list[MarkerInfo]:
    """Collect every SnapBack marker in ``text``, in order of appearance.

    Each regex match becomes one ``MarkerInfo``: the two counts are converted
    to ``int`` and the span is the match's (start, end) offsets in ``text``.
    An empty list is returned when nothing matches.
    """
    return [
        MarkerInfo(
            original_count=int(found.group(1)),
            compressed_count=int(found.group(2)),
            hash_id=found.group(3),
            span=found.span(),
        )
        for found in MARKER_PATTERN.finditer(text)
    ]
41+
42+
43+
def has_markers(text: str) -> bool:
    """Report whether ``text`` contains at least one SnapBack marker."""
    first_hit = MARKER_PATTERN.search(text)
    return first_hit is not None
46+
47+
48+
def strip_markers(text: str) -> str:
    """Delete every SnapBack marker from ``text``, then trim trailing whitespace."""
    without_markers = MARKER_PATTERN.sub("", text)
    return without_markers.rstrip()

scripts/lib/snapback/retriever.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""SnapBack retriever: generates tool definitions and handles retrieval calls.
2+
3+
Part of claw-compactor. License: MIT.
4+
"""
5+
from __future__ import annotations
6+
from typing import Any
7+
from .store import SnapBackStore
8+
9+
10+
# Name under which the retrieval tool appears in generated tool definitions.
TOOL_NAME = "snapback_retrieve"

# Description text placed in generated tool definitions.
TOOL_DESCRIPTION = (
    "Retrieve the original uncompressed content for a compressed section. "
    "Use this when you need more detail from a section marked with a retrieval hash."
)
15+
16+
17+
def snapback_tool_def(provider: str = "openai") -> dict[str, Any]:
    """Build the retrieval tool definition in the shape ``provider`` expects.

    ``provider == "anthropic"`` yields a flat dict with ``name``,
    ``description`` and ``input_schema``. Any other value yields the
    OpenAI-style ``{"type": "function", "function": {...}}`` wrapper with the
    schema stored under ``parameters``.
    """
    hash_id_schema = {
        "type": "string",
        "description": "The 24-character hash ID from the compression marker.",
    }
    keywords_schema = {
        "type": "array",
        "items": {"type": "string"},
        "description": "Optional keywords to filter the retrieved content.",
    }
    schema = {
        "type": "object",
        "properties": {
            "hash_id": hash_id_schema,
            "keywords": keywords_schema,
        },
        "required": ["hash_id"],
    }

    if provider != "anthropic":
        # The OpenAI layout doubles as the fallback for any other provider name.
        return {
            "type": "function",
            "function": {
                "name": TOOL_NAME,
                "description": TOOL_DESCRIPTION,
                "parameters": schema,
            },
        }
    return {
        "name": TOOL_NAME,
        "description": TOOL_DESCRIPTION,
        "input_schema": schema,
    }
50+
51+
52+
def handle_retrieval(store: SnapBackStore, tool_call: dict[str, Any]) -> dict[str, Any]:
    """Process a snapback_retrieve tool call and return the result.

    The call arguments are looked up, in order, under ``tool_call["arguments"]``,
    ``tool_call["input"]`` and the nested ``tool_call["function"]["arguments"]``.
    A string payload is decoded as JSON; an empty or whitespace-only string
    and any non-dict payload are treated as "no arguments".

    Args:
        store: Object providing ``retrieve(hash_id)`` and
            ``search(hash_id, keywords)``.
        tool_call: The tool-call payload as a dict.

    Returns:
        ``{"status": "ok", "content": ...}`` when the store returns content,
        otherwise ``{"status": "not_found", "message": ...}``.

    Raises:
        json.JSONDecodeError: If the arguments are a non-empty string that is
            not valid JSON.
    """
    import json

    args = tool_call.get("arguments", tool_call.get("input"))
    if args is None:
        # Fall back to the nested {"function": {"arguments": ...}} layout.
        function_part = tool_call.get("function")
        if isinstance(function_part, dict):
            args = function_part.get("arguments")
    if isinstance(args, str):
        args = json.loads(args) if args.strip() else {}
    if not isinstance(args, dict):
        # None, lists, numbers, ...: nothing usable, so behave as if no
        # arguments were given instead of failing on .get().
        args = {}

    hash_id = args.get("hash_id", "")
    keywords = args.get("keywords") or []
    if isinstance(keywords, str):
        # A bare string would otherwise be iterated character by character.
        keywords = [keywords]

    if keywords:
        content = store.search(hash_id, keywords)
    else:
        content = store.retrieve(hash_id)

    if content is None:
        return {
            "status": "not_found",
            "message": f"No content found for hash={hash_id}. It may have expired.",
        }
    return {
        "status": "ok",
        "content": content,
    }

scripts/lib/snapback/store.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""SnapBack store: LRU cache for original text, enabling reversible compression.
2+
3+
Part of claw-compactor. License: MIT.
4+
"""
5+
from __future__ import annotations
6+
import hashlib
7+
import time
8+
from collections import OrderedDict
9+
from dataclasses import dataclass
10+
from typing import Optional
11+
12+
13+
@dataclass(frozen=True)
class CacheEntry:
    """Frozen record held by the SnapBack store for one piece of text."""

    # Text before compression.
    original: str
    # Text after compression.
    compressed: str
    # Timestamp recorded when the entry was stored.
    stored_at: float
    # Token count of the original text.
    original_tokens: int
    # Token count of the compressed text.
    compressed_tokens: int
20+
21+
22+
class SnapBackStore:
    """LRU store mapping hash IDs to original text for later retrieval.

    Entries are kept in an ``OrderedDict`` ordered from least to most recently
    used. At most ``max_entries`` entries are kept; an entry older than
    ``ttl_seconds`` is dropped when it is next looked up.
    """

    def __init__(self, max_entries: int = 500, ttl_seconds: int = 600):
        self._cache: OrderedDict[str, CacheEntry] = OrderedDict()
        self.max_entries = max_entries
        self.ttl_seconds = ttl_seconds

    def store(self, original: str, compressed: str,
              original_tokens: int = 0, compressed_tokens: int = 0) -> str:
        """Store original text and return a 24-char hex hash ID.

        The ID is the first 24 hex digits of the SHA-256 of ``original``
        (UTF-8 encoded), so storing the same text again replaces the existing
        entry and refreshes its timestamp and recency.
        """
        hash_id = hashlib.sha256(original.encode("utf-8")).hexdigest()[:24]
        self._cache[hash_id] = CacheEntry(
            original=original,
            compressed=compressed,
            stored_at=time.monotonic(),
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
        )
        # Mark as most recently used (a re-stored key keeps its old slot
        # on plain assignment).
        self._cache.move_to_end(hash_id)
        self._evict_overflow()
        return hash_id

    def _evict_overflow(self) -> None:
        """Drop least recently used entries until at most ``max_entries`` remain."""
        # The emptiness check stops popitem() from raising KeyError when
        # max_entries is negative.
        while self._cache and len(self._cache) > self.max_entries:
            self._cache.popitem(last=False)

    def retrieve(self, hash_id: str) -> Optional[str]:
        """Retrieve original text by hash ID. Returns None if expired or missing.

        An expired entry is removed from the store; a live entry is marked as
        most recently used.
        """
        entry = self._cache.get(hash_id)
        if entry is None:
            return None
        if time.monotonic() - entry.stored_at > self.ttl_seconds:
            del self._cache[hash_id]
            return None
        self._cache.move_to_end(hash_id)
        return entry.original

    def search(self, hash_id: str, keywords: list[str]) -> Optional[str]:
        """Retrieve original text and keep only lines containing a keyword.

        Matching is a case-insensitive substring test. A bare string is
        treated as a single keyword. Returns None if the entry is missing or
        expired, and the full original text when ``keywords`` is empty or no
        line matches.
        """
        original = self.retrieve(hash_id)
        if original is None:
            return None
        if isinstance(keywords, str):
            # Iterating a bare string would test each character separately.
            keywords = [keywords]
        if not keywords:
            return original
        needles = [keyword.lower() for keyword in keywords]
        matched = []
        for line in original.split("\n"):
            lowered = line.lower()
            if any(needle in lowered for needle in needles):
                matched.append(line)
        return "\n".join(matched) if matched else original

    @property
    def size(self) -> int:
        """Number of entries currently held, including not-yet-purged expired ones."""
        return len(self._cache)

    def clear(self) -> None:
        """Remove every entry from the store."""
        self._cache.clear()

scripts/lib/transforms/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
"""Transform Pipeline framework for Claw Compactor v7.0.
2+
3+
Part of claw-compactor. License: MIT.
4+
"""
5+
from lib.transforms.base import Transform, CompressContext, TransformResult
6+
from lib.transforms.pipeline import Pipeline, PipelineResult
7+
8+
__all__ = [
9+
"Transform",
10+
"Pipeline",
11+
"CompressContext",
12+
"TransformResult",
13+
"PipelineResult",
14+
]

scripts/lib/transforms/base.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Transform base classes for Claw Compactor pipeline.
2+
3+
Part of claw-compactor. License: MIT.
4+
"""
5+
from __future__ import annotations
6+
import time
7+
from abc import ABC, abstractmethod
8+
from dataclasses import dataclass, field, replace
9+
from typing import Any
10+
11+
12+
@dataclass(frozen=True)
class CompressContext:
    """Frozen bundle of the content being compressed plus its descriptive settings.

    Fields cannot be reassigned on an instance; call ``evolve`` to derive a
    modified copy.
    """

    content: str
    content_type: str = "text"  # text|code|json|log|diff|search
    language: str | None = None
    role: str = "user"  # system|user|assistant|tool
    model: str | None = None
    token_budget: int | None = None
    query: str | None = None
    metadata: dict = field(default_factory=dict)

    def evolve(self, **kwargs) -> CompressContext:
        """Build a copy of this context with the given fields overridden."""
        updated = replace(self, **kwargs)
        return updated
27+
28+
29+
@dataclass(frozen=True)
class TransformResult:
    """Frozen outcome of running one transform."""

    # Text produced by the transform.
    content: str
    # Token count before the transform ran.
    original_tokens: int = 0
    # Token count after the transform ran.
    compressed_tokens: int = 0
    # Marker strings reported by the transform.
    markers: list[str] = field(default_factory=list)
    # Warning strings reported by the transform.
    warnings: list[str] = field(default_factory=list)
    # Wall time of the transform in milliseconds.
    timing_ms: float = 0.0
    # True when the transform did not run.
    skipped: bool = False
39+
40+
41+
class Transform(ABC):
    """Base class for all compression transforms.

    Subclasses implement ``should_apply`` and ``apply``; ``timed_apply``
    combines the two and records how long ``apply`` took.
    """

    name: str = "unnamed"
    order: int = 50  # execution order (lower = earlier)

    @abstractmethod
    def should_apply(self, ctx: CompressContext) -> bool:
        """Return True if this transform should run on the given context."""
        ...

    @abstractmethod
    def apply(self, ctx: CompressContext) -> TransformResult:
        """Apply the transform and return the result."""
        ...

    def timed_apply(self, ctx: CompressContext) -> TransformResult:
        """Apply with timing. Used by Pipeline.

        Returns a result with ``skipped=True`` and the content unchanged when
        ``should_apply`` declines. Otherwise returns a copy of the result of
        ``apply`` with ``timing_ms`` set to the measured duration and
        ``skipped`` set to False. Exceptions raised by ``apply`` propagate.
        """
        if not self.should_apply(ctx):
            return TransformResult(content=ctx.content, skipped=True)
        start = time.monotonic()
        result = self.apply(ctx)
        elapsed_ms = (time.monotonic() - start) * 1000
        # replace() copies every field of the result (and keeps its concrete
        # class), so fields added later are not silently dropped.
        return replace(result, timing_ms=elapsed_ms, skipped=False)

scripts/lib/transforms/pipeline.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""Pipeline engine: runs a chain of Transforms sequentially.
2+
3+
Part of claw-compactor. License: MIT.
4+
"""
5+
from __future__ import annotations
6+
import logging
7+
from dataclasses import dataclass, field
8+
from lib.transforms.base import Transform, CompressContext, TransformResult
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
@dataclass(frozen=True)
class StepResult:
    """Frozen pairing of a transform's name with the result it produced."""

    # Name of the transform that ran (or was skipped) at this step.
    transform_name: str
    # Result returned for this step.
    result: TransformResult
18+
19+
20+
@dataclass(frozen=True)
class PipelineResult:
    """Frozen summary of a complete pipeline run."""

    # Final text after the pipeline has run.
    content: str
    # One StepResult per transform, in execution order.
    steps: list[StepResult] = field(default_factory=list)
    # Sum of the per-step timings, in milliseconds.
    total_timing_ms: float = 0.0
    # Markers gathered from the steps.
    markers: list[str] = field(default_factory=list)
    # Warnings gathered from the steps.
    warnings: list[str] = field(default_factory=list)
28+
29+
30+
class Pipeline:
    """A chain of Transforms kept sorted by ascending ``order``."""

    def __init__(self, transforms: list[Transform] | None = None):
        chain = list(transforms) if transforms else []
        # list.sort is stable, so transforms with equal order keep the
        # relative position they were given in.
        chain.sort(key=lambda item: item.order)
        self._transforms: list[Transform] = chain

    def add(self, transform: Transform) -> Pipeline:
        """Build a new Pipeline containing ``transform``; this one is left untouched."""
        # The constructor takes care of ordering the extended chain.
        return Pipeline([*self._transforms, transform])

    @property
    def transforms(self) -> list[Transform]:
        """A fresh list of the transforms in execution order."""
        return self._transforms.copy()

    def run(self, ctx: CompressContext) -> PipelineResult:
        """Feed ``ctx`` through every transform in order and summarise the run.

        Each transform that runs replaces the working content with its own
        output; a skipped transform leaves the working context as it was.
        Every transform, skipped or not, contributes a step and its timing.
        Markers and warnings are gathered only from transforms that ran.
        """
        step_log: list[StepResult] = []
        collected_markers: list[str] = []
        collected_warnings: list[str] = []
        elapsed_ms = 0.0
        working = ctx

        for stage in self._transforms:
            outcome = stage.timed_apply(working)
            step_log.append(StepResult(transform_name=stage.name, result=outcome))
            elapsed_ms += outcome.timing_ms

            if outcome.skipped:
                logger.debug("%s: skipped", stage.name)
                continue

            working = working.evolve(content=outcome.content)
            collected_markers.extend(outcome.markers)
            collected_warnings.extend(outcome.warnings)
            logger.debug(
                "%s: %d→%d tokens (%.1fms)",
                stage.name,
                outcome.original_tokens,
                outcome.compressed_tokens,
                outcome.timing_ms,
            )

        return PipelineResult(
            content=working.content,
            steps=step_log,
            total_timing_ms=elapsed_ms,
            markers=collected_markers,
            warnings=collected_warnings,
        )

0 commit comments

Comments
 (0)