Skip to content

Commit b4c39c3

Browse files
author
Flamehaven CI
committed
Release 1.1.2
1 parent f110345 commit b4c39c3

File tree

8 files changed

+104
-28
lines changed

8 files changed

+104
-28
lines changed

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [1.1.2] - 2025-12-09
9+
10+
### Security
11+
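- Masking now pre-compiles the basic/advanced regexes and skips all regex processing (including custom patterns) when the input exceeds 1,000,000 characters, to reduce ReDoS risk. Note that such oversized input is returned unmasked.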
12+
- Individual files larger than 1MB are skipped before being read, preventing OOM/hangs; each skip prints a warning and is recorded in the output as a placeholder entry.
13+
14+
### Performance
15+
- Token estimation is cached with LRU (maxsize 2048) and keeps a minimum of one token for empty strings.
16+
17+
### Tests
18+
- Pytest suite: 22 passed, 2 skipped.
19+
820
## [1.1.1] - 2025-12-04
921

1022
### Removed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@ Dir2md converts directory structures into AI-friendly markdown with intelligent
2020

2121
**New to Dir2md?** Check out **[Wiki.md](docs/Wiki.md)** for a friendly introduction with examples.
2222

23+
## Fresh highlights (1.1.2)
24+
- Masking regexes are pre-compiled with a large-input guard to reduce ReDoS risk.
25+
- Individual files larger than 1MB are skipped entirely (not truncated), with a warning and a placeholder entry, to avoid OOM/hangs.
26+
- Token estimation is now LRU-cached (minimum 1 token) for faster repeated calculations.
27+
2328
### Try Online
2429
[**Dir2md Demo on Hugging Face Spaces**](https://huggingface.co/spaces/Flamehaven/dir2md-demo) — No installation required
2530

demo/README.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,10 @@ Convert any public GitHub repository into an LLM-ready markdown blueprint plus o
3232
2) Choose options: include contents, emit manifest, enable spicy/strict.
3333
3) Run and download the markdown/JSONL outputs.
3434

35-
## Fresh highlights (1.1.0)
36-
- `--fast` preset (tree + manifest only, no file reads).
37-
- Default dual outputs (md + jsonl) for human + LLM.
38-
- Spicy risk report (`--spicy`, `--spicy-strict`) with 5 severity levels.
39-
- Modular pipeline (`walker`, `selector`, `renderer`, `orchestrator`) for cleaner extensibility.
35+
## Fresh highlights (1.1.2)
36+
- Masking regex pre-compilation with a large-input guard to reduce ReDoS risk.
37+
- Individual files larger than 1MB are skipped entirely (not truncated), with a warning, to avoid OOM/hangs.
38+
- Token estimation now LRU-cached (minimum 1 token) for faster repeated calculations.
4039

4140
## Notes
4241
- Current Gradio SDK: **5.45.0**. A newer **6.0.2** is available; update `sdk_version` and `gradio` pin if you want to try it.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "dir2md"
7-
version = "1.1.0"
7+
version = "1.1.2"
88
description = "Generate a Markdown blueprint: directory tree + optional file contents (token-optimized, ICEBERG preset)"
99
readme = "README.md"
1010
authors = [{name = "Flamehaven", email = "info@flamehaven.space"}]

src/dir2md/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
from .core import Config, generate_markdown_report
44

55
__all__ = ["__version__", "apply_masking", "Config", "generate_markdown_report"]
6-
__version__ = "1.1.0"
6+
__version__ = "1.1.2"

src/dir2md/masking.py

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
import re
2-
from typing import Iterable
1+
import re
2+
from typing import Iterable, Dict, Pattern
3+
4+
# Guard extremely large inputs from expensive regex processing (ReDoS safety)
5+
MAX_MASK_INPUT_CHARS = 1_000_000
36

47
# Basic masking rules (available in OSS version)
58
BASIC_MASKING_RULES = {
@@ -28,38 +31,57 @@
2831
PRO_MASK_REPLACEMENT = "[*** MASKED_SECRET_PRO ***]"
2932
CUSTOM_MASK_REPLACEMENT = "[*** MASKED_SECRET ***]"
3033

31-
def get_active_masking_rules(mode: str = "basic"):
32-
"""Get masking rules; advanced mode always allowed in OSS build."""
33-
rules = BASIC_MASKING_RULES.copy()
34+
# Pre-compiled regex cache for faster and safer masking
35+
_COMPILED_RULES: Dict[str, Dict[str, Pattern[str]]] = {"basic": {}, "advanced": {}}
36+
37+
38+
def _compile_rules() -> None:
39+
"""Compile masking regex patterns at import time."""
40+
for name, pattern in BASIC_MASKING_RULES.items():
41+
flags = re.DOTALL if name == "PRIVATE_KEY" else 0
42+
_COMPILED_RULES["basic"][name] = re.compile(pattern, flags=flags)
43+
44+
_COMPILED_RULES["advanced"].update(_COMPILED_RULES["basic"])
45+
for name, pattern in ADVANCED_MASKING_RULES.items():
46+
_COMPILED_RULES["advanced"][name] = re.compile(pattern)
47+
48+
49+
_compile_rules()
3450

51+
52+
def get_active_masking_rules(mode: str = "basic") -> dict[str, Pattern[str]]:
53+
"""Return compiled masking rules for the requested mode."""
3554
if mode == "advanced":
36-
rules.update(ADVANCED_MASKING_RULES)
55+
return _COMPILED_RULES["advanced"]
56+
return _COMPILED_RULES["basic"]
3757

38-
return rules
3958

4059
def apply_masking(text: str, mode: str = "basic", custom_patterns: Iterable[str] | None = None) -> str:
4160
"""Apply masking rules with optional custom patterns (no license gating)."""
4261

62+
if not text:
63+
return text
64+
4365
if mode == "off":
44-
rules = {}
66+
rules: dict[str, Pattern[str]] = {}
4567
else:
4668
rules = get_active_masking_rules(mode)
47-
69+
70+
# Anti-ReDoS: avoid running regexes on extremely large payloads
71+
if len(text) > MAX_MASK_INPUT_CHARS:
72+
return text
73+
4874
# Apply custom patterns first so project-specific rules run before bundled ones.
4975
if custom_patterns:
5076
for pattern in custom_patterns:
5177
try:
52-
text = re.sub(pattern, CUSTOM_MASK_REPLACEMENT, text, flags=re.DOTALL)
78+
compiled = re.compile(pattern, flags=re.DOTALL)
79+
text = compiled.sub(CUSTOM_MASK_REPLACEMENT, text)
5380
except re.error as exc:
5481
print(f"[WARN] Skipping invalid custom mask pattern: {pattern!r} ({exc})")
5582

5683
for rule_name, pattern in rules.items():
5784
replacement = PRO_MASK_REPLACEMENT if rule_name in ADVANCED_MASKING_RULES else MASK_REPLACEMENT
58-
# Use DOTALL flag for private keys to match across newlines
59-
if rule_name == "PRIVATE_KEY":
60-
text = re.sub(pattern, replacement, text, flags=re.DOTALL)
61-
else:
62-
text = re.sub(pattern, replacement, text)
63-
64-
return text
65-
85+
text = pattern.sub(replacement, text)
86+
87+
return text

src/dir2md/selector.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from .summary import summarize
1111
from .search import match_query_snippet
1212

13+
SINGLE_FILE_MAX_BYTES = 1 * 1024 * 1024 # 1MB guard per file
14+
1315

1416
def build_candidates(cfg, files: List[Path], root: Path, is_included, is_omitted) -> Tuple[List[dict], Dict[Path, str]]:
1517
candidates: list[dict] = []
@@ -35,6 +37,35 @@ def build_candidates(cfg, files: List[Path], root: Path, is_included, is_omitted
3537
except ValueError:
3638
continue
3739

40+
try:
41+
size = f.stat().st_size
42+
except OSError:
43+
continue
44+
45+
if size > SINGLE_FILE_MAX_BYTES:
46+
print(f"[WARN] Skipping {f} ({size} bytes > {SINGLE_FILE_MAX_BYTES} bytes limit)")
47+
text = f"<Skipped: File too large ({size} bytes > {SINGLE_FILE_MAX_BYTES} bytes limit)>"
48+
placeholder_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
49+
match_score = 0
50+
snippet = ""
51+
if cfg.query:
52+
match_score, snippet = match_query_snippet(text, cfg.query)
53+
sh = simhash64(text)
54+
if cfg.dedup_bits > 0 and any(hamming(sh, h0) <= cfg.dedup_bits for h0 in sim_seen):
55+
continue
56+
sim_seen.append(sh)
57+
candidates.append({
58+
"path": f,
59+
"sha256": placeholder_hash,
60+
"summary": summarize(f, text, max_lines=10),
61+
"text": text,
62+
"simhash": sh,
63+
"match_score": match_score,
64+
"snippet": snippet,
65+
})
66+
candidate_hash[f] = placeholder_hash
67+
continue
68+
3869
try:
3970
h = hashlib.sha256()
4071
collected = bytearray()
@@ -82,4 +113,4 @@ def build_candidates(cfg, files: List[Path], root: Path, is_included, is_omitted
82113
candidates = matched
83114
candidates.sort(key=lambda rec: rec.get("match_score", 0), reverse=True)
84115

85-
return candidates, candidate_hash
116+
return candidates, candidate_hash

src/dir2md/token.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
from __future__ import annotations
22

3+
import math
4+
from functools import lru_cache
5+
6+
7+
@lru_cache(maxsize=2048)
38
def estimate_tokens(text: str) -> int:
4-
# Simple estimation: 4 chars ≈ 1 token
5-
return max(1, (len(text) + 3)//4)
9+
"""Estimate token count using a rough 4 chars-per-token heuristic."""
10+
if not text:
11+
return 1
12+
return max(1, math.ceil(len(text) / 4))

0 commit comments

Comments
 (0)