From 0007278c8b28faab9a7936bd5ac0c442fe8369b1 Mon Sep 17 00:00:00 2001 From: Varun Nuthalapati Date: Thu, 9 Apr 2026 10:22:51 -0700 Subject: [PATCH 1/3] feat: add dry-run CLI command graphify dry-run [path] scans the corpus with detect() and prints a file-count table with corpus health warnings without writing any output files or building the graph. --- graphify/__main__.py | 31 ++++++++++++++++++++++ tests/test_dry_run.py | 62 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 tests/test_dry_run.py diff --git a/graphify/__main__.py b/graphify/__main__.py index 604fa524..16b105fc 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -619,6 +619,7 @@ def main() -> None: print(" --type T query type: query|path_query|explain (default: query)") print(" --nodes N1 N2 ... source node labels cited in the answer") print(" --memory-dir DIR memory directory (default: graphify-out/memory)") + print(" dry-run [path] scan corpus and report file counts/health without building") print(" benchmark [graph.json] measure token reduction vs naive full-corpus approach") print(" hook install install post-commit/post-checkout git hooks (all platforms)") print(" hook uninstall remove git hooks") @@ -790,6 +791,36 @@ def main() -> None: source_nodes=opts.nodes or None, ) print(f"Saved to {out}") + elif cmd == "dry-run": + root = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(".") + if not root.exists(): + print(f"error: path not found: {root}", file=sys.stderr) + sys.exit(1) + from graphify.detect import detect as _detect + result = _detect(root) + files = result["files"] + total_files = result["total_files"] + total_words = result["total_words"] + print(f"Corpus scan: {root.resolve()}") + print() + type_labels = { + "code": "Code files", + "document": "Documents", + "paper": "Papers/PDFs", + "image": "Images", + } + for ftype, label in type_labels.items(): + count = len(files.get(ftype, [])) + if count: + print(f" {label:<16} 
{count:>5}") + print(f" {'Total':<16} {total_files:>5} (~{total_words:,} words)") + if result.get("skipped_sensitive"): + print(f"\n Skipped (sensitive): {len(result['skipped_sensitive'])} file(s)") + if result.get("warning"): + print(f"\nwarning: {result['warning']}") + else: + print("\nCorpus looks healthy — no warnings.") + print("\nNo files were written. Run without dry-run to build the graph.") elif cmd == "benchmark": from graphify.benchmark import run_benchmark, print_benchmark graph_path = sys.argv[2] if len(sys.argv) > 2 else "graphify-out/graph.json" diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py new file mode 100644 index 00000000..23d5a308 --- /dev/null +++ b/tests/test_dry_run.py @@ -0,0 +1,62 @@ +"""Tests for the `graphify dry-run` CLI command.""" +import sys +import pytest +from pathlib import Path +from unittest.mock import patch + + +def _run_main(argv): + """Run graphify.__main__.main() with the given argv, capture stdout.""" + import io + from graphify.__main__ import main + buf = io.StringIO() + exit_code = 0 + with patch("sys.argv", argv), patch("sys.stdout", buf): + try: + main() + except SystemExit as e: + exit_code = e.code or 0 + return buf.getvalue(), exit_code + + +def test_dry_run_prints_summary(tmp_path): + """dry-run on a directory with code files prints a file-count summary.""" + (tmp_path / "app.py").write_text("x = 1\n") + (tmp_path / "utils.py").write_text("def f(): pass\n") + out, code = _run_main(["graphify", "dry-run", str(tmp_path)]) + assert code == 0 + assert "Corpus scan" in out + assert "Code files" in out + assert "Total" in out + + +def test_dry_run_no_files_written(tmp_path): + """dry-run must not create graphify-out/ or any output files.""" + (tmp_path / "readme.md").write_text("# hello\n") + _run_main(["graphify", "dry-run", str(tmp_path)]) + assert not (tmp_path / "graphify-out").exists() + + +def test_dry_run_default_path(tmp_path, monkeypatch): + """dry-run with no path argument defaults to the current 
directory.""" + (tmp_path / "main.py").write_text("print('hi')\n") + monkeypatch.chdir(tmp_path) + out, code = _run_main(["graphify", "dry-run"]) + assert code == 0 + assert "Corpus scan" in out + + +def test_dry_run_missing_path(tmp_path): + """dry-run with a non-existent path exits non-zero.""" + with pytest.raises(SystemExit) as exc: + with patch("sys.argv", ["graphify", "dry-run", str(tmp_path / "nonexistent")]): + from graphify.__main__ import main + main() + assert exc.value.code != 0 + + +def test_dry_run_no_graphify_out_written(tmp_path): + """dry-run output says no files were written.""" + (tmp_path / "a.py").write_text("a = 1\n") + out, _ = _run_main(["graphify", "dry-run", str(tmp_path)]) + assert "No files were written" in out From 0b3e6ebab263a6f6defa944f80783e508e565928 Mon Sep 17 00:00:00 2001 From: Varun Nuthalapati Date: Sun, 26 Apr 2026 10:07:45 -0700 Subject: [PATCH 2/3] fix: skip office sidecar writes during dry-run detect() now accepts write_sidecars=False; when disabled, office files are counted directly without calling convert_office_file() or touching graphify-out/converted/. The dry-run CLI branch passes this flag so the no-write promise holds even for .docx/.xlsx corpora. Adds test_dry_run_office_no_sidecar_written to assert convert_office_file is never called during dry-run. 
--- graphify/__main__.py | 2 +- graphify/detect.py | 19 ++++++++++++------- tests/test_dry_run.py | 13 +++++++++++++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/graphify/__main__.py b/graphify/__main__.py index 16b105fc..af55bc49 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -797,7 +797,7 @@ def main() -> None: print(f"error: path not found: {root}", file=sys.stderr) sys.exit(1) from graphify.detect import detect as _detect - result = _detect(root) + result = _detect(root, write_sidecars=False) files = result["files"] total_files = result["total_files"] total_words = result["total_words"] diff --git a/graphify/detect.py b/graphify/detect.py index 9a5f16e0..6898f8d4 100644 --- a/graphify/detect.py +++ b/graphify/detect.py @@ -299,7 +299,7 @@ def _is_ignored(path: Path, root: Path, patterns: list[str]) -> bool: return False -def detect(root: Path, *, follow_symlinks: bool = False) -> dict: +def detect(root: Path, *, follow_symlinks: bool = False, write_sidecars: bool = True) -> dict: files: dict[FileType, list[str]] = { FileType.CODE: [], FileType.DOCUMENT: [], @@ -366,13 +366,18 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict: if ftype: # Office files: convert to markdown sidecar so subagents can read them if p.suffix.lower() in OFFICE_EXTENSIONS: - md_path = convert_office_file(p, converted_dir) - if md_path: - files[ftype].append(str(md_path)) - total_words += count_words(md_path) + if write_sidecars: + md_path = convert_office_file(p, converted_dir) + if md_path: + files[ftype].append(str(md_path)) + total_words += count_words(md_path) + else: + # Conversion failed (library not installed) - skip with note + skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]") else: - # Conversion failed (library not installed) - skip with note - skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]") + # dry-run: count words directly 
without writing any files + files[ftype].append(str(p)) + total_words += count_words(p) continue files[ftype].append(str(p)) total_words += count_words(p) diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py index 23d5a308..7743ec14 100644 --- a/tests/test_dry_run.py +++ b/tests/test_dry_run.py @@ -60,3 +60,16 @@ def test_dry_run_no_graphify_out_written(tmp_path): (tmp_path / "a.py").write_text("a = 1\n") out, _ = _run_main(["graphify", "dry-run", str(tmp_path)]) assert "No files were written" in out + + +def test_dry_run_office_no_sidecar_written(tmp_path): + """dry-run must not write office sidecars even when .docx/.xlsx files are present.""" + from unittest.mock import MagicMock, patch as mpatch + + # Create a fake .docx so detect sees it as an office file + (tmp_path / "report.docx").write_bytes(b"PK\x03\x04") # minimal docx magic bytes + + with mpatch("graphify.detect.convert_office_file") as mock_convert: + _run_main(["graphify", "dry-run", str(tmp_path)]) + + mock_convert.assert_not_called() From feb29c3d34110c60ad1365fdd7d740f8181f30f0 Mon Sep 17 00:00:00 2001 From: Varun Nuthalapati Date: Tue, 28 Apr 2026 08:36:40 -0700 Subject: [PATCH 3/3] fix: warn when office deps missing during dry-run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In write_sidecars=False mode, probe office files via docx_to_markdown/ xlsx_to_markdown (which return '' on ImportError). Empty result means the real run would also extract nothing — add to skipped list with an install hint instead of silently counting 0 words. __main__.py surfaces a dedicated 'Skipped (office deps missing)' line with pip install hint, and suppresses 'Corpus looks healthy' when office files were skipped. Adds test_dry_run_office_missing_deps_warns to assert the warning and install hint appear when docx_to_markdown is patched to return ''. Closes feedback from qodo-ai-reviewer on PR #157. 
--- graphify/__main__.py | 13 ++++++++++--- graphify/detect.py | 22 +++++++++++++++++++--- tests/test_dry_run.py | 15 +++++++++++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/graphify/__main__.py b/graphify/__main__.py index af55bc49..3f560b51 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -814,11 +814,18 @@ def main() -> None: if count: print(f" {label:<16} {count:>5}") print(f" {'Total':<16} {total_files:>5} (~{total_words:,} words)") - if result.get("skipped_sensitive"): - print(f"\n Skipped (sensitive): {len(result['skipped_sensitive'])} file(s)") + skipped = result.get("skipped_sensitive", []) + office_missing = [s for s in skipped if "office deps missing" in s] + sensitive = [s for s in skipped if "office deps missing" not in s] + if sensitive: + print(f"\n Skipped (sensitive): {len(sensitive)} file(s)") + if office_missing: + print(f"\n Skipped (office deps missing): {len(office_missing)} file(s)") + print(" Install office support: pip install graphifyy[office]") + print(" These files will not be extracted in a real run without the extras.") if result.get("warning"): print(f"\nwarning: {result['warning']}") - else: + elif not office_missing: print("\nCorpus looks healthy — no warnings.") print("\nNo files were written. Run without dry-run to build the graph.") elif cmd == "benchmark": diff --git a/graphify/detect.py b/graphify/detect.py index 6898f8d4..591bc7e7 100644 --- a/graphify/detect.py +++ b/graphify/detect.py @@ -375,9 +375,25 @@ def detect(root: Path, *, follow_symlinks: bool = False, write_sidecars: bool = # Conversion failed (library not installed) - skip with note skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]") else: - # dry-run: count words directly without writing any files - files[ftype].append(str(p)) - total_words += count_words(p) + # dry-run: no sidecar writes allowed. + # Probe whether office deps are installed by attempting an + # in-memory conversion. 
If the result is empty the real run + # would also produce no content — surface a warning now. + ext = p.suffix.lower() + if ext == ".docx": + probe = docx_to_markdown(p) + elif ext == ".xlsx": + probe = xlsx_to_markdown(p) + else: + probe = None # unknown office type — count as-is + + if probe is not None and not probe.strip(): + skipped_sensitive.append( + str(p) + " [office deps missing - pip install graphifyy[office]]" + ) + else: + files[ftype].append(str(p)) + total_words += count_words(p) continue files[ftype].append(str(p)) total_words += count_words(p) diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py index 7743ec14..9265a74c 100644 --- a/tests/test_dry_run.py +++ b/tests/test_dry_run.py @@ -73,3 +73,18 @@ def test_dry_run_office_no_sidecar_written(tmp_path): _run_main(["graphify", "dry-run", str(tmp_path)]) mock_convert.assert_not_called() + + +def test_dry_run_office_missing_deps_warns(tmp_path): + """dry-run warns when office deps are missing and content would be empty in a real run.""" + from unittest.mock import patch as mpatch + + (tmp_path / "report.docx").write_bytes(b"PK\x03\x04") + + # Simulate missing python-docx: docx_to_markdown returns "" + with mpatch("graphify.detect.docx_to_markdown", return_value=""): + out, code = _run_main(["graphify", "dry-run", str(tmp_path)]) + + assert code == 0 + assert "office deps missing" in out.lower() + assert "pip install graphifyy[office]" in out