From 0007278c8b28faab9a7936bd5ac0c442fe8369b1 Mon Sep 17 00:00:00 2001
From: Varun Nuthalapati <nuthalapativarun@gmail.com>
Date: Thu, 9 Apr 2026 10:22:51 -0700
Subject: [PATCH 1/3] feat: add dry-run CLI command

`graphify dry-run [path]` scans the corpus with detect() and prints a
file-count table plus any corpus health warnings, without writing any
output files or building the graph.
---
 graphify/__main__.py  | 31 ++++++++++++++++++++++
 tests/test_dry_run.py | 62 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 tests/test_dry_run.py

diff --git a/graphify/__main__.py b/graphify/__main__.py
index 604fa524..16b105fc 100644
--- a/graphify/__main__.py
+++ b/graphify/__main__.py
@@ -619,6 +619,7 @@ def main() -> None:
         print("    --type T                query type: query|path_query|explain (default: query)")
         print("    --nodes N1 N2 ...       source node labels cited in the answer")
         print("    --memory-dir DIR        memory directory (default: graphify-out/memory)")
+        print("  dry-run [path]          scan corpus and report file counts/health without building")
         print("  benchmark [graph.json]  measure token reduction vs naive full-corpus approach")
         print("  hook install            install post-commit/post-checkout git hooks (all platforms)")
         print("  hook uninstall          remove git hooks")
@@ -790,6 +791,36 @@ def main() -> None:
             source_nodes=opts.nodes or None,
         )
         print(f"Saved to {out}")
+    elif cmd == "dry-run":
+        root = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(".")
+        if not root.exists():
+            print(f"error: path not found: {root}", file=sys.stderr)
+            sys.exit(1)
+        from graphify.detect import detect as _detect
+        result = _detect(root)
+        files = result["files"]
+        total_files = result["total_files"]
+        total_words = result["total_words"]
+        print(f"Corpus scan: {root.resolve()}")
+        print()
+        type_labels = {
+            "code": "Code files",
+            "document": "Documents",
+            "paper": "Papers/PDFs",
+            "image": "Images",
+        }
+        for ftype, label in type_labels.items():
+            count = len(files.get(ftype, []))
+            if count:
+                print(f"  {label:<16} {count:>5}")
+        print(f"  {'Total':<16} {total_files:>5}  (~{total_words:,} words)")
+        if result.get("skipped_sensitive"):
+            print(f"\n  Skipped (sensitive): {len(result['skipped_sensitive'])} file(s)")
+        if result.get("warning"):
+            print(f"\nwarning: {result['warning']}")
+        else:
+            print("\nCorpus looks healthy — no warnings.")
+        print("\nNo files were written. Run without dry-run to build the graph.")
     elif cmd == "benchmark":
         from graphify.benchmark import run_benchmark, print_benchmark
         graph_path = sys.argv[2] if len(sys.argv) > 2 else "graphify-out/graph.json"
diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py
new file mode 100644
index 00000000..23d5a308
--- /dev/null
+++ b/tests/test_dry_run.py
@@ -0,0 +1,62 @@
+"""Tests for the `graphify dry-run` CLI command."""
+import sys
+import pytest
+from pathlib import Path
+from unittest.mock import patch
+
+
+def _run_main(argv):
+    """Run graphify.__main__.main() with the given argv, capture stdout."""
+    import io
+    from graphify.__main__ import main
+    buf = io.StringIO()
+    exit_code = 0
+    with patch("sys.argv", argv), patch("sys.stdout", buf):
+        try:
+            main()
+        except SystemExit as e:
+            exit_code = e.code or 0
+    return buf.getvalue(), exit_code
+
+
+def test_dry_run_prints_summary(tmp_path):
+    """dry-run on a directory with code files prints a file-count summary."""
+    (tmp_path / "app.py").write_text("x = 1\n")
+    (tmp_path / "utils.py").write_text("def f(): pass\n")
+    out, code = _run_main(["graphify", "dry-run", str(tmp_path)])
+    assert code == 0
+    assert "Corpus scan" in out
+    assert "Code files" in out
+    assert "Total" in out
+
+
+def test_dry_run_no_files_written(tmp_path):
+    """dry-run must not create graphify-out/ or any output files."""
+    (tmp_path / "readme.md").write_text("# hello\n")
+    _run_main(["graphify", "dry-run", str(tmp_path)])
+    assert not (tmp_path / "graphify-out").exists()
+
+
+def test_dry_run_default_path(tmp_path, monkeypatch):
+    """dry-run with no path argument defaults to the current directory."""
+    (tmp_path / "main.py").write_text("print('hi')\n")
+    monkeypatch.chdir(tmp_path)
+    out, code = _run_main(["graphify", "dry-run"])
+    assert code == 0
+    assert "Corpus scan" in out
+
+
+def test_dry_run_missing_path(tmp_path):
+    """dry-run with a non-existent path exits non-zero."""
+    with pytest.raises(SystemExit) as exc:
+        with patch("sys.argv", ["graphify", "dry-run", str(tmp_path / "nonexistent")]):
+            from graphify.__main__ import main
+            main()
+    assert exc.value.code != 0
+
+
+def test_dry_run_no_graphify_out_written(tmp_path):
+    """dry-run output says no files were written."""
+    (tmp_path / "a.py").write_text("a = 1\n")
+    out, _ = _run_main(["graphify", "dry-run", str(tmp_path)])
+    assert "No files were written" in out

From 0b3e6ebab263a6f6defa944f80783e508e565928 Mon Sep 17 00:00:00 2001
From: Varun Nuthalapati <nuthalapativarun@gmail.com>
Date: Sun, 26 Apr 2026 10:07:45 -0700
Subject: [PATCH 2/3] fix: skip office sidecar writes during dry-run

detect() now accepts write_sidecars=False; when disabled, office files
are counted directly without calling convert_office_file() or touching
graphify-out/converted/. The dry-run CLI branch passes this flag so the
no-write promise holds even for .docx/.xlsx corpora.

Adds test_dry_run_office_no_sidecar_written to assert convert_office_file
is never called during dry-run.
---
 graphify/__main__.py  |  2 +-
 graphify/detect.py    | 19 ++++++++++++-------
 tests/test_dry_run.py | 13 +++++++++++++
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/graphify/__main__.py b/graphify/__main__.py
index 16b105fc..af55bc49 100644
--- a/graphify/__main__.py
+++ b/graphify/__main__.py
@@ -797,7 +797,7 @@ def main() -> None:
             print(f"error: path not found: {root}", file=sys.stderr)
             sys.exit(1)
         from graphify.detect import detect as _detect
-        result = _detect(root)
+        result = _detect(root, write_sidecars=False)
         files = result["files"]
         total_files = result["total_files"]
         total_words = result["total_words"]
diff --git a/graphify/detect.py b/graphify/detect.py
index 9a5f16e0..6898f8d4 100644
--- a/graphify/detect.py
+++ b/graphify/detect.py
@@ -299,7 +299,7 @@ def _is_ignored(path: Path, root: Path, patterns: list[str]) -> bool:
     return False
 
 
-def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
+def detect(root: Path, *, follow_symlinks: bool = False, write_sidecars: bool = True) -> dict:
     files: dict[FileType, list[str]] = {
         FileType.CODE: [],
         FileType.DOCUMENT: [],
@@ -366,13 +366,18 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
         if ftype:
             # Office files: convert to markdown sidecar so subagents can read them
             if p.suffix.lower() in OFFICE_EXTENSIONS:
-                md_path = convert_office_file(p, converted_dir)
-                if md_path:
-                    files[ftype].append(str(md_path))
-                    total_words += count_words(md_path)
+                if write_sidecars:
+                    md_path = convert_office_file(p, converted_dir)
+                    if md_path:
+                        files[ftype].append(str(md_path))
+                        total_words += count_words(md_path)
+                    else:
+                        # Conversion failed (library not installed) - skip with note
+                        skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
                 else:
-                    # Conversion failed (library not installed) - skip with note
-                    skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
+                    # dry-run: count words directly without writing any files
+                    files[ftype].append(str(p))
+                    total_words += count_words(p)
                 continue
             files[ftype].append(str(p))
             total_words += count_words(p)
diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py
index 23d5a308..7743ec14 100644
--- a/tests/test_dry_run.py
+++ b/tests/test_dry_run.py
@@ -60,3 +60,16 @@ def test_dry_run_no_graphify_out_written(tmp_path):
     (tmp_path / "a.py").write_text("a = 1\n")
     out, _ = _run_main(["graphify", "dry-run", str(tmp_path)])
     assert "No files were written" in out
+
+
+def test_dry_run_office_no_sidecar_written(tmp_path):
+    """dry-run must not write office sidecars even when .docx/.xlsx files are present."""
+    from unittest.mock import MagicMock, patch as mpatch
+
+    # Create a fake .docx so detect sees it as an office file
+    (tmp_path / "report.docx").write_bytes(b"PK\x03\x04")  # minimal docx magic bytes
+
+    with mpatch("graphify.detect.convert_office_file") as mock_convert:
+        _run_main(["graphify", "dry-run", str(tmp_path)])
+
+    mock_convert.assert_not_called()

From feb29c3d34110c60ad1365fdd7d740f8181f30f0 Mon Sep 17 00:00:00 2001
From: Varun Nuthalapati <nuthalapativarun@gmail.com>
Date: Tue, 28 Apr 2026 08:36:40 -0700
Subject: [PATCH 3/3] fix: warn when office deps missing during dry-run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In write_sidecars=False mode, probe office files via docx_to_markdown/
xlsx_to_markdown (which return '' on ImportError). Empty result means
the real run would also extract nothing — add to skipped list with an
install hint instead of silently counting 0 words.

__main__.py surfaces a dedicated 'Skipped (office deps missing)' line
with pip install hint, and suppresses 'Corpus looks healthy' when
office files were skipped.

Adds test_dry_run_office_missing_deps_warns to assert the warning and
install hint appear when docx_to_markdown is patched to return ''.

Addresses feedback from qodo-ai-reviewer on PR #157.
---
 graphify/__main__.py  | 13 ++++++++++---
 graphify/detect.py    | 22 +++++++++++++++++++---
 tests/test_dry_run.py | 15 +++++++++++++++
 3 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/graphify/__main__.py b/graphify/__main__.py
index af55bc49..3f560b51 100644
--- a/graphify/__main__.py
+++ b/graphify/__main__.py
@@ -814,11 +814,18 @@ def main() -> None:
             if count:
                 print(f"  {label:<16} {count:>5}")
         print(f"  {'Total':<16} {total_files:>5}  (~{total_words:,} words)")
-        if result.get("skipped_sensitive"):
-            print(f"\n  Skipped (sensitive): {len(result['skipped_sensitive'])} file(s)")
+        skipped = result.get("skipped_sensitive", [])
+        office_missing = [s for s in skipped if "office deps missing" in s]
+        sensitive = [s for s in skipped if "office deps missing" not in s]
+        if sensitive:
+            print(f"\n  Skipped (sensitive): {len(sensitive)} file(s)")
+        if office_missing:
+            print(f"\n  Skipped (office deps missing): {len(office_missing)} file(s)")
+            print("  Install office support: pip install graphifyy[office]")
+            print("  These files will not be extracted in a real run without the extras.")
         if result.get("warning"):
             print(f"\nwarning: {result['warning']}")
-        else:
+        elif not office_missing:
             print("\nCorpus looks healthy — no warnings.")
         print("\nNo files were written. Run without dry-run to build the graph.")
     elif cmd == "benchmark":
diff --git a/graphify/detect.py b/graphify/detect.py
index 6898f8d4..591bc7e7 100644
--- a/graphify/detect.py
+++ b/graphify/detect.py
@@ -375,9 +375,25 @@ def detect(root: Path, *, follow_symlinks: bool = False, write_sidecars: bool =
                         # Conversion failed (library not installed) - skip with note
                         skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
                 else:
-                    # dry-run: count words directly without writing any files
-                    files[ftype].append(str(p))
-                    total_words += count_words(p)
+                    # dry-run: no sidecar writes allowed.
+                    # Probe whether office deps are installed by attempting an
+                    # in-memory conversion. If the result is empty the real run
+                    # would also produce no content — surface a warning now.
+                    ext = p.suffix.lower()
+                    if ext == ".docx":
+                        probe = docx_to_markdown(p)
+                    elif ext == ".xlsx":
+                        probe = xlsx_to_markdown(p)
+                    else:
+                        probe = None  # unknown office type — count as-is
+
+                    if probe is not None and not probe.strip():
+                        skipped_sensitive.append(
+                            str(p) + " [office deps missing - pip install graphifyy[office]]"
+                        )
+                    else:
+                        files[ftype].append(str(p))
+                        total_words += count_words(p)
                 continue
             files[ftype].append(str(p))
             total_words += count_words(p)
diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py
index 7743ec14..9265a74c 100644
--- a/tests/test_dry_run.py
+++ b/tests/test_dry_run.py
@@ -73,3 +73,18 @@ def test_dry_run_office_no_sidecar_written(tmp_path):
         _run_main(["graphify", "dry-run", str(tmp_path)])
 
     mock_convert.assert_not_called()
+
+
+def test_dry_run_office_missing_deps_warns(tmp_path):
+    """dry-run warns when office deps are missing and content would be empty in a real run."""
+    from unittest.mock import patch as mpatch
+
+    (tmp_path / "report.docx").write_bytes(b"PK\x03\x04")
+
+    # Simulate missing python-docx: docx_to_markdown returns ""
+    with mpatch("graphify.detect.docx_to_markdown", return_value=""):
+        out, code = _run_main(["graphify", "dry-run", str(tmp_path)])
+
+    assert code == 0
+    assert "office deps missing" in out.lower()
+    assert "pip install graphifyy[office]" in out