Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions graphify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,7 @@ def main() -> None:
print(" --type T query type: query|path_query|explain (default: query)")
print(" --nodes N1 N2 ... source node labels cited in the answer")
print(" --memory-dir DIR memory directory (default: graphify-out/memory)")
print(" dry-run [path] scan corpus and report file counts/health without building")
print(" benchmark [graph.json] measure token reduction vs naive full-corpus approach")
print(" hook install install post-commit/post-checkout git hooks (all platforms)")
print(" hook uninstall remove git hooks")
Expand Down Expand Up @@ -790,6 +791,43 @@ def main() -> None:
source_nodes=opts.nodes or None,
)
print(f"Saved to {out}")
elif cmd == "dry-run":
root = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(".")
if not root.exists():
print(f"error: path not found: {root}", file=sys.stderr)
sys.exit(1)
from graphify.detect import detect as _detect
result = _detect(root, write_sidecars=False)
files = result["files"]
total_files = result["total_files"]
total_words = result["total_words"]
print(f"Corpus scan: {root.resolve()}")
print()
type_labels = {
"code": "Code files",
"document": "Documents",
"paper": "Papers/PDFs",
"image": "Images",
}
for ftype, label in type_labels.items():
count = len(files.get(ftype, []))
if count:
print(f" {label:<16} {count:>5}")
print(f" {'Total':<16} {total_files:>5} (~{total_words:,} words)")
skipped = result.get("skipped_sensitive", [])
office_missing = [s for s in skipped if "office deps missing" in s]
sensitive = [s for s in skipped if "office deps missing" not in s]
if sensitive:
print(f"\n Skipped (sensitive): {len(sensitive)} file(s)")
if office_missing:
print(f"\n Skipped (office deps missing): {len(office_missing)} file(s)")
print(" Install office support: pip install graphify[office]")
print(" These files will not be extracted in a real run without the extras.")
if result.get("warning"):
print(f"\nwarning: {result['warning']}")
elif not office_missing:
print("\nCorpus looks healthy — no warnings.")
print("\nNo files were written. Run without dry-run to build the graph.")
elif cmd == "benchmark":
from graphify.benchmark import run_benchmark, print_benchmark
graph_path = sys.argv[2] if len(sys.argv) > 2 else "graphify-out/graph.json"
Expand Down
35 changes: 28 additions & 7 deletions graphify/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def _is_ignored(path: Path, root: Path, patterns: list[str]) -> bool:
return False


def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
def detect(root: Path, *, follow_symlinks: bool = False, write_sidecars: bool = True) -> dict:
files: dict[FileType, list[str]] = {
FileType.CODE: [],
FileType.DOCUMENT: [],
Expand Down Expand Up @@ -366,13 +366,34 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
if ftype:
# Office files: convert to markdown sidecar so subagents can read them
if p.suffix.lower() in OFFICE_EXTENSIONS:
md_path = convert_office_file(p, converted_dir)
if md_path:
files[ftype].append(str(md_path))
total_words += count_words(md_path)
if write_sidecars:
md_path = convert_office_file(p, converted_dir)
if md_path:
files[ftype].append(str(md_path))
total_words += count_words(md_path)
else:
# Conversion failed (library not installed) - skip with note
skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
else:
# Conversion failed (library not installed) - skip with note
skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
# dry-run: no sidecar writes allowed.
# Probe whether office deps are installed by attempting an
# in-memory conversion. If the result is empty the real run
# would also produce no content — surface a warning now.
ext = p.suffix.lower()
if ext == ".docx":
probe = docx_to_markdown(p)
elif ext == ".xlsx":
probe = xlsx_to_markdown(p)
else:
probe = None # unknown office type — count as-is

if probe is not None and not probe.strip():
skipped_sensitive.append(
str(p) + " [office deps missing - pip install graphify[office]]"
)
else:
files[ftype].append(str(p))
total_words += count_words(p)
continue
files[ftype].append(str(p))
total_words += count_words(p)
Expand Down
90 changes: 90 additions & 0 deletions tests/test_dry_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Tests for the `graphify dry-run` CLI command."""
import sys
import pytest
from pathlib import Path
from unittest.mock import patch


def _run_main(argv):
"""Run graphify.__main__.main() with the given argv, capture stdout."""
import io
from graphify.__main__ import main
buf = io.StringIO()
exit_code = 0
with patch("sys.argv", argv), patch("sys.stdout", buf):
try:
main()
except SystemExit as e:
exit_code = e.code or 0
return buf.getvalue(), exit_code


def test_dry_run_prints_summary(tmp_path):
    """A corpus holding only code files yields the file-count summary."""
    sources = {"app.py": "x = 1\n", "utils.py": "def f(): pass\n"}
    for filename, body in sources.items():
        (tmp_path / filename).write_text(body)

    stdout, status = _run_main(["graphify", "dry-run", str(tmp_path)])

    assert status == 0
    # Header line, per-type row, and totals row must all be present.
    for expected in ("Corpus scan", "Code files", "Total"):
        assert expected in stdout


def test_dry_run_no_files_written(tmp_path):
    """Scanning in dry-run mode leaves no graphify-out/ directory in the corpus."""
    readme = tmp_path / "readme.md"
    readme.write_text("# hello\n")

    cli_args = ["graphify", "dry-run", str(tmp_path)]
    _run_main(cli_args)

    output_dir = tmp_path / "graphify-out"
    assert not output_dir.exists()


def test_dry_run_default_path(tmp_path, monkeypatch):
    """Omitting the path argument makes dry-run scan the working directory."""
    script = tmp_path / "main.py"
    script.write_text("print('hi')\n")
    # Make tmp_path the cwd so the "." default resolves to the test corpus.
    monkeypatch.chdir(tmp_path)

    stdout, status = _run_main(["graphify", "dry-run"])

    assert status == 0
    assert "Corpus scan" in stdout


def test_dry_run_missing_path(tmp_path, capsys):
    """dry-run on a non-existent path exits non-zero with a 'path not found' error.

    Uses the shared ``_run_main`` helper like the other tests in this module,
    and additionally checks the stderr message so that an unrelated failure
    which merely happens to exit non-zero does not satisfy the test.
    """
    missing = tmp_path / "nonexistent"

    _, code = _run_main(["graphify", "dry-run", str(missing)])

    assert code != 0
    # _run_main only redirects stdout; the error is printed to stderr, which
    # pytest's capsys fixture collects.
    assert "path not found" in capsys.readouterr().err


def test_dry_run_no_graphify_out_written(tmp_path):
    """The dry-run report includes the 'No files were written' notice."""
    source_file = tmp_path / "a.py"
    source_file.write_text("a = 1\n")

    result = _run_main(["graphify", "dry-run", str(tmp_path)])
    stdout = result[0]  # exit code is not relevant to this check

    assert "No files were written" in stdout


def test_dry_run_office_no_sidecar_written(tmp_path):
    """dry-run must not call the office sidecar converter when a .docx is present."""
    # Create a fake .docx (zip magic bytes only) so detect sees an office file.
    (tmp_path / "report.docx").write_bytes(b"PK\x03\x04")

    # Replace the converter with a mock so any call to it is recorded.
    with patch("graphify.detect.convert_office_file") as mock_convert:
        _run_main(["graphify", "dry-run", str(tmp_path)])

    mock_convert.assert_not_called()


def test_dry_run_office_missing_deps_warns(tmp_path):
    """dry-run warns when office deps are missing and content would be empty in a real run."""
    (tmp_path / "report.docx").write_bytes(b"PK\x03\x04")

    # Simulate missing python-docx: docx_to_markdown returns ""
    with patch("graphify.detect.docx_to_markdown", return_value=""):
        out, code = _run_main(["graphify", "dry-run", str(tmp_path)])

    assert code == 0
    # Require the full marker rather than the bare word "office": pytest names
    # tmp_path after the test function, and that path is echoed in the
    # "Corpus scan:" line, so a bare "office" check would always pass.
    assert "office deps missing" in out.lower()
    assert "pip install graphify[office]" in out