Release 1.1.2

Flamehaven CI · Flamehaven CI · commit b4c39c3a6132 · 2025-12-09T15:13:58.000+07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.1.2] - 2025-12-09
+
+### Security
+- Masking now pre-compiles the basic/advanced regexes and skips masking entirely for inputs longer than 1,000,000 characters (such inputs are returned unmasked) to reduce ReDoS risk.
+- Individual files larger than 1 MiB (1,048,576 bytes) are skipped before being read, preventing OOM/hangs; a warning is printed and a placeholder records the skip.
+
+### Performance
+- Token estimation is memoized with an LRU cache (maxsize 2048) and still returns a minimum of one token for empty strings.
+
+### Tests
+- Pytest suite: 22 passed, 2 skipped.
+
 ## [1.1.1] - 2025-12-04
 
 ### Removed
diff --git a/README.md b/README.md
@@ -20,6 +20,11 @@ Dir2md converts directory structures into AI-friendly markdown with intelligent
 
 **New to Dir2md?** Check out **[Wiki.md](docs/Wiki.md)** for a friendly introduction with examples.
 
+## Fresh highlights (1.1.2)
+- Masking regexes are pre-compiled, and inputs longer than 1,000,000 characters skip masking (returned unmasked) to reduce ReDoS risk.
+- Individual files larger than 1 MiB are skipped before being read, with a warning, to avoid OOM/hangs.
+- Token estimation is now LRU-cached (minimum 1 token) for faster repeated calculations.
+
 ### Try Online
 [**Dir2md Demo on Hugging Face Spaces**](https://huggingface.co/spaces/Flamehaven/dir2md-demo) — No installation required
 
diff --git a/demo/README.md b/demo/README.md
@@ -32,11 +32,10 @@ Convert any public GitHub repository into an LLM-ready markdown blueprint plus o
 2) Choose options: include contents, emit manifest, enable spicy/strict.  
 3) Run and download the markdown/JSONL outputs.
 
-## Fresh highlights (1.1.0)
-- `--fast` preset (tree + manifest only, no file reads).
-- Default dual outputs (md + jsonl) for human + LLM.
-- Spicy risk report (`--spicy`, `--spicy-strict`) with 5 severity levels.
-- Modular pipeline (`walker`, `selector`, `renderer`, `orchestrator`) for cleaner extensibility.
+## Fresh highlights (1.1.2)
+- Masking regexes are pre-compiled, and inputs longer than 1,000,000 characters skip masking (returned unmasked) to reduce ReDoS risk.
+- Individual files larger than 1 MiB are skipped before being read, with a warning, to avoid OOM/hangs.
+- Token estimation now LRU-cached (minimum 1 token) for faster repeated calculations.
 
 ## Notes
 - Current Gradio SDK: **5.45.0**. A newer **6.0.2** is available; update `sdk_version` and `gradio` pin if you want to try it.
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dir2md"
-version = "1.1.0"
+version = "1.1.2"
 description = "Generate a Markdown blueprint: directory tree + optional file contents (token-optimized, ICEBERG preset)"
 readme = "README.md"
 authors = [{name = "Flamehaven", email = "info@flamehaven.space"}]
diff --git a/src/dir2md/__init__.py b/src/dir2md/__init__.py
@@ -3,4 +3,4 @@
 from .core import Config, generate_markdown_report
 
 __all__ = ["__version__", "apply_masking", "Config", "generate_markdown_report"]
-__version__ = "1.1.0"
+__version__ = "1.1.2"
diff --git a/src/dir2md/masking.py b/src/dir2md/masking.py
@@ -1,5 +1,8 @@
-﻿import re
-from typing import Iterable
+import re
+from typing import Iterable, Dict, Pattern
+
+# Inputs longer than this many characters bypass masking and are returned unchanged (ReDoS safety)
+MAX_MASK_INPUT_CHARS = 1_000_000
 
 # Basic masking rules (available in OSS version)
 BASIC_MASKING_RULES = {
@@ -28,38 +31,57 @@
 PRO_MASK_REPLACEMENT = "[*** MASKED_SECRET_PRO ***]"
 CUSTOM_MASK_REPLACEMENT = "[*** MASKED_SECRET ***]"
 
-def get_active_masking_rules(mode: str = "basic"):
-    """Get masking rules; advanced mode always allowed in OSS build."""
-    rules = BASIC_MASKING_RULES.copy()
+# Pre-compiled regex cache for faster and safer masking
+_COMPILED_RULES: Dict[str, Dict[str, Pattern[str]]] = {"basic": {}, "advanced": {}}
+
+
+def _compile_rules() -> None:
+    """Compile masking regex patterns at import time."""
+    for name, pattern in BASIC_MASKING_RULES.items():
+        flags = re.DOTALL if name == "PRIVATE_KEY" else 0
+        _COMPILED_RULES["basic"][name] = re.compile(pattern, flags=flags)
+
+    _COMPILED_RULES["advanced"].update(_COMPILED_RULES["basic"])
+    for name, pattern in ADVANCED_MASKING_RULES.items():
+        _COMPILED_RULES["advanced"][name] = re.compile(pattern)
+
+
+_compile_rules()
 
+
+def get_active_masking_rules(mode: str = "basic") -> dict[str, Pattern[str]]:
+    """Return compiled masking rules for the requested mode."""
     if mode == "advanced":
-        rules.update(ADVANCED_MASKING_RULES)
+        return _COMPILED_RULES["advanced"]
+    return _COMPILED_RULES["basic"]
 
-    return rules
 
 def apply_masking(text: str, mode: str = "basic", custom_patterns: Iterable[str] | None = None) -> str:
     """Apply masking rules with optional custom patterns (no license gating)."""
 
+    if not text:
+        return text
+
     if mode == "off":
-        rules = {}
+        rules: dict[str, Pattern[str]] = {}
     else:
         rules = get_active_masking_rules(mode)
-    
+
+    # Anti-ReDoS: avoid running regexes on extremely large payloads
+    if len(text) > MAX_MASK_INPUT_CHARS:
+        return text
+
     # Apply custom patterns first so project-specific rules run before bundled ones.
     if custom_patterns:
         for pattern in custom_patterns:
             try:
-                text = re.sub(pattern, CUSTOM_MASK_REPLACEMENT, text, flags=re.DOTALL)
+                compiled = re.compile(pattern, flags=re.DOTALL)
+                text = compiled.sub(CUSTOM_MASK_REPLACEMENT, text)
             except re.error as exc:
                 print(f"[WARN] Skipping invalid custom mask pattern: {pattern!r} ({exc})")
 
     for rule_name, pattern in rules.items():
         replacement = PRO_MASK_REPLACEMENT if rule_name in ADVANCED_MASKING_RULES else MASK_REPLACEMENT
-        # Use DOTALL flag for private keys to match across newlines
-        if rule_name == "PRIVATE_KEY":
-            text = re.sub(pattern, replacement, text, flags=re.DOTALL)
-        else:
-            text = re.sub(pattern, replacement, text)
-
-    return text
-
+        text = pattern.sub(replacement, text)
+
+    return text
diff --git a/src/dir2md/selector.py b/src/dir2md/selector.py
@@ -10,6 +10,8 @@
 from .summary import summarize
 from .search import match_query_snippet
 
+SINGLE_FILE_MAX_BYTES = 1 * 1024 * 1024  # 1MB guard per file
+
 
 def build_candidates(cfg, files: List[Path], root: Path, is_included, is_omitted) -> Tuple[List[dict], Dict[Path, str]]:
     candidates: list[dict] = []
@@ -35,6 +37,35 @@ def build_candidates(cfg, files: List[Path], root: Path, is_included, is_omitted
             except ValueError:
                 continue
 
+        try:
+            size = f.stat().st_size
+        except OSError:
+            continue
+
+        if size > SINGLE_FILE_MAX_BYTES:
+            print(f"[WARN] Skipping {f} ({size} bytes > {SINGLE_FILE_MAX_BYTES} bytes limit)")
+            text = f"<Skipped: File too large ({size} bytes > {SINGLE_FILE_MAX_BYTES} bytes limit)>"
+            placeholder_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
+            match_score = 0
+            snippet = ""
+            if cfg.query:
+                match_score, snippet = match_query_snippet(text, cfg.query)
+            sh = simhash64(text)
+            if cfg.dedup_bits > 0 and any(hamming(sh, h0) <= cfg.dedup_bits for h0 in sim_seen):
+                continue
+            sim_seen.append(sh)
+            candidates.append({
+                "path": f,
+                "sha256": placeholder_hash,
+                "summary": summarize(f, text, max_lines=10),
+                "text": text,
+                "simhash": sh,
+                "match_score": match_score,
+                "snippet": snippet,
+            })
+            candidate_hash[f] = placeholder_hash
+            continue
+
         try:
             h = hashlib.sha256()
             collected = bytearray()
@@ -82,4 +113,4 @@ def build_candidates(cfg, files: List[Path], root: Path, is_included, is_omitted
             candidates = matched
         candidates.sort(key=lambda rec: rec.get("match_score", 0), reverse=True)
 
-    return candidates, candidate_hash
+    return candidates, candidate_hash
diff --git a/src/dir2md/token.py b/src/dir2md/token.py
@@ -1,5 +1,12 @@
 from __future__ import annotations
 
+import math
+from functools import lru_cache
+
+
+@lru_cache(maxsize=2048)
 def estimate_tokens(text: str) -> int:
-    # Simple estimation: 4 chars ≈ 1 token
-    return max(1, (len(text) + 3)//4)
+    """Estimate token count using a rough 4 chars-per-token heuristic."""
+    if not text:
+        return 1
+    return max(1, math.ceil(len(text) / 4))