Skip to content

Commit c8df98e

Browse files
aeromomoclaude
andcommitted
feat: Phase 2 — Cortex content router + Neurosyntax AST code compressor
- Cortex (order=5): rule-based content type detection (code/json/log/diff/search/text) with language fingerprinting (Python/JS/TS/Go/Rust/Java + 10 more) - Neurosyntax (order=25): AST-aware code compression via tree-sitter with safe regex fallback - Never shortens identifiers or modifies string literals - Removes pure comments, collapses docstrings, deduplicates blank lines - FusionResult now supports context_updates for cross-stage context evolution - 51 new tests (18 cortex + 31 neurosyntax + 2 integration), 1077 total passed Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1a785e8 commit c8df98e

7 files changed

Lines changed: 1307 additions & 1 deletion

File tree

scripts/lib/fusion/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ class FusionResult:
3636
warnings: list[str] = field(default_factory=list)
3737
timing_ms: float = 0.0
3838
skipped: bool = False
39+
# Optional overrides applied to FusionContext after this stage runs.
40+
# Keys must match FusionContext field names (e.g. content_type, language).
41+
context_updates: dict[str, Any] = field(default_factory=dict)
3942

4043

4144
class FusionStage(ABC):
@@ -68,4 +71,5 @@ def timed_apply(self, ctx: FusionContext) -> FusionResult:
6871
warnings=result.warnings,
6972
timing_ms=elapsed,
7073
skipped=False,
74+
context_updates=result.context_updates,
7175
)
Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
"""Rule-based content type detector for the Fusion Pipeline Cortex.
2+
3+
Detection priority (highest confidence first):
4+
1. Markdown code fences → code + language (0.95)
5+
2. Diff headers → diff (0.95)
6+
3. JSON parse → json (0.90)
7+
4. Shebang line → code + language (0.90)
8+
5. Log line density → log (0.80)
9+
6. Search result density → search (0.80)
10+
7. Code keyword density → code (0.70)
11+
8. Fallback → text (0.50)
12+
13+
Part of claw-compactor. License: MIT.
14+
"""
15+
from __future__ import annotations
16+
17+
import json
18+
import re
19+
from dataclasses import dataclass
20+
21+
22+
# ---------------------------------------------------------------------------
23+
# Public types
24+
# ---------------------------------------------------------------------------
25+
26+
@dataclass(frozen=True)
class DetectionResult:
    """Immutable outcome of one content-type detection pass."""

    # One of: text | code | json | log | diff | search
    content_type: str
    # Language name when one was identified, otherwise None
    language: str | None
    # Confidence score, 0.0 – 1.0
    confidence: float
31+
32+
33+
@dataclass(frozen=True)
class Section:
    """Immutable slice of a document together with its detected type."""

    # Raw text of the slice
    content: str
    # Content type label assigned to the slice (e.g. "code", "text")
    content_type: str
    # Language name when one was identified, otherwise None
    language: str | None
    # Line number of the first line of the slice
    start_line: int
    # Line number of the last line of the slice
    end_line: int
40+
41+
42+
# ---------------------------------------------------------------------------
43+
# Regex constants
44+
# ---------------------------------------------------------------------------
45+
46+
# Code fence: ```lang or ~~~lang (lang optional)
47+
_FENCE_OPEN = re.compile(r"^(`{3,}|~{3,})([\w+-]*)$", re.MULTILINE)
48+
_FENCE_CLOSE_BACKTICK = re.compile(r"^`{3,}\s*$", re.MULTILINE)
49+
_FENCE_CLOSE_TILDE = re.compile(r"^~{3,}\s*$", re.MULTILINE)
50+
51+
# Diff
52+
_DIFF_HEADER = re.compile(r"^(--- a/|\+\+\+ b/|@@ .* @@)", re.MULTILINE)
53+
54+
# JSON first char
55+
_JSON_START = re.compile(r"^\s*[\[{]")
56+
57+
# Shebang
58+
_SHEBANG = re.compile(r"^#!")
59+
60+
# Log line: leading timestamp + log level keyword
61+
_LOG_LINE = re.compile(
62+
r"(?:"
63+
r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}" # ISO timestamp
64+
r"|"
65+
r"\[?\d{2}[:/]\d{2}[:/]\d{2}\]?" # HH:MM:SS
66+
r")"
67+
r".{0,40}"
68+
r"\b(?:INFO|WARN(?:ING)?|ERROR|DEBUG|FATAL|TRACE|CRITICAL)\b",
69+
re.IGNORECASE,
70+
)
71+
72+
# Search result: path:lineno: content (grep/rg style)
73+
_SEARCH_LINE = re.compile(r"^[^\s:][^:]*:\d+[:\s]")
74+
75+
# Code keywords (per-line density check)
76+
_CODE_KEYWORDS = re.compile(
77+
r"\b(?:import|from|def |class |function |const |let |var |return|if |else |"
78+
r"for |while |switch |case |elif |endif|public |private |protected |"
79+
r"static |void |int |str |bool |fn |func |package |use )\b"
80+
)
81+
82+
# Language fingerprints for content-based detection (no fence)
83+
_LANG_PATTERNS: list[tuple[str, re.Pattern[str]]] = [
84+
("python", re.compile(r"\bdef \w+\(|^from \w+ import |^import \w|class \w+\s*:", re.MULTILINE)),
85+
("go", re.compile(r"^package \w|^func \w+\(|^import \(", re.MULTILINE)),
86+
("rust", re.compile(r"\bfn \w+\(|let mut |^impl |^use \w", re.MULTILINE)),
87+
("java", re.compile(r"\bpublic class |\bprivate |\bprotected |\bpublic static void main\b")),
88+
("typescript", re.compile(r"\b(const|let|var)\b\s+\w+\s*:\s*\w+|interface \w+\s*\{|export type |:\s*(string|number|boolean|any|void|never)\b")),
89+
("javascript", re.compile(r"\b(const|let|var)\b|\bfunction\b|\b=>\b|\bexport\b|\brequire\s*\(")),
90+
("css", re.compile(r"^\s*[\w#.:\[*][^{]*\{\s*$", re.MULTILINE)),
91+
("html", re.compile(r"<(!DOCTYPE|html|head|body|div|span|p|a)\b", re.IGNORECASE)),
92+
("sql", re.compile(r"\b(SELECT|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|FROM|WHERE)\b", re.IGNORECASE)),
93+
("yaml", re.compile(r"^\w[\w\s]*:\s*\S", re.MULTILINE)),
94+
]
95+
96+
# Fence language aliases → canonical name
97+
_FENCE_LANG_MAP: dict[str, str] = {
98+
"py": "python",
99+
"python": "python",
100+
"python3": "python",
101+
"js": "javascript",
102+
"javascript": "javascript",
103+
"jsx": "javascript",
104+
"ts": "typescript",
105+
"typescript": "typescript",
106+
"tsx": "typescript",
107+
"java": "java",
108+
"go": "go",
109+
"golang": "go",
110+
"rs": "rust",
111+
"rust": "rust",
112+
"c": "c",
113+
"cpp": "cpp",
114+
"c++": "cpp",
115+
"cxx": "cpp",
116+
"rb": "ruby",
117+
"ruby": "ruby",
118+
"php": "php",
119+
"sh": "shell",
120+
"bash": "shell",
121+
"shell": "shell",
122+
"zsh": "shell",
123+
"fish": "shell",
124+
"sql": "sql",
125+
"yaml": "yaml",
126+
"yml": "yaml",
127+
"toml": "toml",
128+
"html": "html",
129+
"css": "css",
130+
"json": "json",
131+
"xml": "xml",
132+
"md": "markdown",
133+
"markdown": "markdown",
134+
}
135+
136+
# Shebang interpreter → language
137+
_SHEBANG_LANG: list[tuple[re.Pattern[str], str]] = [
138+
(re.compile(r"python"), "python"),
139+
(re.compile(r"node|nodejs"), "javascript"),
140+
(re.compile(r"ruby"), "ruby"),
141+
(re.compile(r"php"), "php"),
142+
(re.compile(r"perl"), "perl"),
143+
(re.compile(r"bash|sh|zsh|fish|dash"), "shell"),
144+
(re.compile(r"env\s+(\w+)"), None), # handled specially below
145+
]
146+
147+
148+
# ---------------------------------------------------------------------------
149+
# Detector
150+
# ---------------------------------------------------------------------------
151+
152+
class ContentDetector:
    """Rule-based content type detector.

    Classifies text as ``text``, ``code``, ``json``, ``log``, ``diff`` or
    ``search`` by running a fixed sequence of checks and returning the result
    of the first one that fires. Instances carry no state of their own.
    """

    # -- Public API ----------------------------------------------------------

    def detect(self, text: str) -> DetectionResult:
        """Detect the content type of *text* and return the best match.

        Checks run in this order, each with a fixed confidence: code fence
        (0.95), diff headers (0.95), JSON parse (0.9), shebang (0.9), log-line
        density (0.8), search-result density (0.8), code-keyword density (0.7).
        Empty or whitespace-only input, and input matching no rule, yields
        ``text`` with confidence 0.5.
        """
        if not text or not text.strip():
            return DetectionResult("text", None, 0.5)

        # 1. Markdown code fence
        fence_result = self._check_code_fence(text)
        if fence_result is not None:
            return fence_result

        # 2. Diff headers
        if self._check_diff(text):
            return DetectionResult("diff", None, 0.95)

        # 3. JSON
        if self._check_json(text):
            return DetectionResult("json", None, 0.9)

        # 4. Shebang
        shebang_result = self._check_shebang(text)
        if shebang_result is not None:
            return shebang_result

        # The density checks below are ratios over non-blank lines only.
        non_empty = [ln for ln in text.splitlines() if ln.strip()]
        total = max(len(non_empty), 1)

        # 5. Log density
        log_hits = sum(1 for ln in non_empty if _LOG_LINE.search(ln))
        if log_hits / total > 0.30:
            return DetectionResult("log", None, 0.8)

        # 6. Search result density
        search_hits = sum(1 for ln in non_empty if _SEARCH_LINE.match(ln))
        if search_hits / total > 0.40:
            return DetectionResult("search", None, 0.8)

        # 7. Code keyword density
        kw_hits = sum(1 for ln in non_empty if _CODE_KEYWORDS.search(ln))
        if kw_hits / total > 0.15:
            return DetectionResult("code", self.detect_language(text), 0.7)

        return DetectionResult("text", None, 0.5)

    def detect_language(self, text: str) -> str | None:
        """Return the first language in ``_LANG_PATTERNS`` whose pattern matches.

        Returns None when no pattern matches. No fence context is used.
        """
        for lang, pattern in _LANG_PATTERNS:
            if pattern.search(text):
                return lang
        return None

    def detect_sections(self, text: str) -> list[Section]:
        """Split mixed content into typed sections.

        Each fenced block (opening fence line through closing fence line)
        becomes a ``code`` section tagged with the fence language; the text
        between fences is classified with :meth:`detect`. ``start_line`` and
        ``end_line`` are 1-based and inclusive. A fence that is never closed
        runs to the last line of the input.
        """
        sections: list[Section] = []
        lines = text.splitlines(keepends=True)
        total_lines = len(lines)
        i = 0
        text_start = 0

        while i < total_lines:
            m = _FENCE_OPEN.match(lines[i].rstrip("\n\r"))
            if m is None:
                i += 1
                continue

            # Flush the non-fenced lines accumulated since the previous fence.
            if i > text_start:
                block = "".join(lines[text_start:i])
                sections.append(self._classify_block(block, text_start + 1, i))

            fence_start = i
            lang = self._normalize_fence_lang(m.group(2))
            # A fence is closed only by a run of the same character.
            opened_with_backtick = m.group(1)[0] == "`"
            close_pat = _FENCE_CLOSE_BACKTICK if opened_with_backtick else _FENCE_CLOSE_TILDE

            i += 1
            while i < total_lines and not close_pat.match(lines[i].rstrip("\n\r")):
                i += 1

            # i is the index of the closing fence, or total_lines when the
            # fence is unterminated. Clamp so end_line never points past the
            # last real line of the input.
            fence_end = min(i + 1, total_lines)
            sections.append(Section(
                content="".join(lines[fence_start:fence_end]),
                content_type="code",
                language=lang,
                start_line=fence_start + 1,
                end_line=fence_end,
            ))
            i = fence_end
            text_start = i

        # Trailing text
        if text_start < total_lines:
            block = "".join(lines[text_start:])
            sections.append(self._classify_block(block, text_start + 1, total_lines))

        return sections

    # -- Private helpers -----------------------------------------------------

    @staticmethod
    def _normalize_fence_lang(raw: str) -> str | None:
        """Map a raw fence tag to its canonical language name.

        Unknown tags are returned lower-cased as-is; an empty tag gives None.
        """
        tag = raw.strip().lower()
        return _FENCE_LANG_MAP.get(tag) or (tag or None)

    def _check_code_fence(self, text: str) -> DetectionResult | None:
        """Return a ``code`` result if *text* contains a fence line, else None.

        The language comes from the first fence line found.
        """
        m = _FENCE_OPEN.search(text)
        if m is None:
            return None
        return DetectionResult("code", self._normalize_fence_lang(m.group(2)), 0.95)

    def _check_diff(self, text: str) -> bool:
        """Return True when at least two diff header lines are present."""
        return len(_DIFF_HEADER.findall(text)) >= 2

    def _check_json(self, text: str) -> bool:
        """Return True when *text* parses as a JSON object or array."""
        stripped = text.strip()
        if not stripped or stripped[0] not in ("{", "["):
            return False
        try:
            json.loads(stripped)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError. RecursionError can be
            # raised for very deeply nested input; treat that as "not JSON"
            # rather than letting detection crash.
            return False
        return True

    def _check_shebang(self, text: str) -> DetectionResult | None:
        """Return a ``code`` result if the first line is a shebang, else None."""
        first_line = text.split("\n", 1)[0]
        if not _SHEBANG.match(first_line):
            return None
        return DetectionResult("code", self._lang_from_shebang(first_line), 0.9)

    def _lang_from_shebang(self, shebang: str) -> str | None:
        """Resolve the language named by a shebang line, or None if unknown."""
        for pattern, lang in _SHEBANG_LANG:
            m = pattern.search(shebang)
            if m:
                if lang is not None:
                    return lang
                # env case: look at captured interpreter name
                interpreter = m.group(1).lower() if m.lastindex else ""
                return _FENCE_LANG_MAP.get(interpreter, interpreter or None)
        return None

    def _classify_block(self, block: str, start_line: int, end_line: int) -> Section:
        """Run :meth:`detect` on *block* and wrap the outcome in a Section."""
        result = self.detect(block)
        return Section(
            content=block,
            content_type=result.content_type,
            language=result.language,
            start_line=start_line,
            end_line=end_line,
        )

scripts/lib/fusion/cortex.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""Cortex — intelligent content router for the Fusion Pipeline.
2+
3+
Runs first (order=5) and detects content_type + language, propagating
4+
them into FusionContext so downstream stages can make type-aware decisions.
5+
6+
Part of claw-compactor. License: MIT.
7+
"""
8+
from __future__ import annotations
9+
10+
from lib.fusion.base import FusionContext, FusionResult, FusionStage
11+
from lib.fusion.content_detector import ContentDetector
12+
from lib.tokens import estimate_tokens
13+
14+
15+
class Cortex(FusionStage):
    """Content router stage: detects content type and language.

    The stage never rewrites the content. Its findings are handed back through
    ``FusionResult.context_updates`` so they can be applied to the context.
    """

    name = "cortex"
    order = 5  # must run before all compressor stages

    def __init__(self) -> None:
        self.detector = ContentDetector()

    def should_apply(self, ctx: FusionContext) -> bool:
        """Run only while the context still carries the ``"text"`` type.

        Any other value is taken to be an explicit decision by the caller and
        is left untouched.
        """
        return ctx.content_type == "text"

    def apply(self, ctx: FusionContext) -> FusionResult:
        """Detect type/language of ``ctx.content`` and report them as updates.

        The content is returned unchanged, so the original and compressed
        token counts are the same estimate. ``language`` is included in the
        updates only when the detector identified one.
        """
        content = ctx.content
        verdict = self.detector.detect(content)
        token_count = estimate_tokens(content)

        updates: dict[str, object] = {"content_type": verdict.content_type}
        if verdict.language is not None:
            updates["language"] = verdict.language

        return FusionResult(
            content=content,
            original_tokens=token_count,
            compressed_tokens=token_count,  # Cortex never modifies content
            skipped=False,
            context_updates=updates,
        )

0 commit comments

Comments
 (0)