safishamsi · nuthalapativarun · Apr 9, 2026 · Apr 26, 2026 · Apr 28, 2026
diff --git a/graphify/__main__.py b/graphify/__main__.py
@@ -619,6 +619,7 @@ def main() -> None:
         print("    --type T                query type: query|path_query|explain (default: query)")
         print("    --nodes N1 N2 ...       source node labels cited in the answer")
         print("    --memory-dir DIR        memory directory (default: graphify-out/memory)")
+        print("  dry-run [path]          scan corpus and report file counts/health without building")
         print("  benchmark [graph.json]  measure token reduction vs naive full-corpus approach")
         print("  hook install            install post-commit/post-checkout git hooks (all platforms)")
         print("  hook uninstall          remove git hooks")
@@ -790,6 +791,43 @@ def main() -> None:
             source_nodes=opts.nodes or None,
         )
         print(f"Saved to {out}")
+    elif cmd == "dry-run":
+        root = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(".")
+        if not root.exists():
+            print(f"error: path not found: {root}", file=sys.stderr)
+            sys.exit(1)
+        from graphify.detect import detect as _detect
+        result = _detect(root, write_sidecars=False)
+        files = result["files"]
+        total_files = result["total_files"]
+        total_words = result["total_words"]
+        print(f"Corpus scan: {root.resolve()}")
+        print()
+        type_labels = {
+            "code": "Code files",
+            "document": "Documents",
+            "paper": "Papers/PDFs",
+            "image": "Images",
+        }
+        for ftype, label in type_labels.items():
+            count = len(files.get(ftype, []))
+            if count:
+                print(f"  {label:<16} {count:>5}")
+        print(f"  {'Total':<16} {total_files:>5}  (~{total_words:,} words)")
+        skipped = result.get("skipped_sensitive", [])
+        office_missing = [s for s in skipped if "office deps missing" in s]
+        sensitive = [s for s in skipped if "office deps missing" not in s]
+        if sensitive:
+            print(f"\n  Skipped (sensitive): {len(sensitive)} file(s)")
+        if office_missing:
+            print(f"\n  Skipped (office deps missing): {len(office_missing)} file(s)")
+            print("  Install office support: pip install graphify[office]")
+            print("  These files will not be extracted in a real run without the extras.")
+        if result.get("warning"):
+            print(f"\nwarning: {result['warning']}")
+        elif not office_missing:
+            print("\nCorpus looks healthy — no warnings.")
+        print("\nNo files were written. Run without dry-run to build the graph.")
     elif cmd == "benchmark":
         from graphify.benchmark import run_benchmark, print_benchmark
         graph_path = sys.argv[2] if len(sys.argv) > 2 else "graphify-out/graph.json"

diff --git a/graphify/detect.py b/graphify/detect.py
@@ -299,7 +299,7 @@ def _is_ignored(path: Path, root: Path, patterns: list[str]) -> bool:
     return False
 
 
-def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
+def detect(root: Path, *, follow_symlinks: bool = False, write_sidecars: bool = True) -> dict:
     files: dict[FileType, list[str]] = {
         FileType.CODE: [],
         FileType.DOCUMENT: [],
@@ -366,13 +366,34 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict:
         if ftype:
             # Office files: convert to markdown sidecar so subagents can read them
             if p.suffix.lower() in OFFICE_EXTENSIONS:
-                md_path = convert_office_file(p, converted_dir)
-                if md_path:
-                    files[ftype].append(str(md_path))
-                    total_words += count_words(md_path)
+                if write_sidecars:
+                    md_path = convert_office_file(p, converted_dir)
+                    if md_path:
+                        files[ftype].append(str(md_path))
+                        total_words += count_words(md_path)
+                    else:
+                        # Conversion failed (library not installed) - skip with note
+                        skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
                 else:
-                    # Conversion failed (library not installed) - skip with note
-                    skipped_sensitive.append(str(p) + " [office conversion failed - pip install graphifyy[office]]")
+                    # dry-run: no sidecar writes allowed.
+                    # Probe whether office deps are installed by attempting an
+                    # in-memory conversion. If the result is empty the real run
+                    # would also produce no content — surface a warning now.
+                    ext = p.suffix.lower()
+                    if ext == ".docx":
+                        probe = docx_to_markdown(p)
+                    elif ext == ".xlsx":
+                        probe = xlsx_to_markdown(p)
+                    else:
+                        probe = None  # unknown office type — count as-is
+
+                    if probe is not None and not probe.strip():
+                        skipped_sensitive.append(
+                            str(p) + " [office deps missing - pip install graphify[office]]"
+                        )
+                    else:
+                        files[ftype].append(str(p))
+                        total_words += count_words(p)
                 continue
             files[ftype].append(str(p))
             total_words += count_words(p)

diff --git a/tests/test_dry_run.py b/tests/test_dry_run.py
@@ -0,0 +1,90 @@
+"""Tests for the `graphify dry-run` CLI command."""
+import sys
+import pytest
+from pathlib import Path
+from unittest.mock import patch
+
+
+def _run_main(argv):
+    """Run graphify.__main__.main() with the given argv, capture stdout."""
+    import io
+    from graphify.__main__ import main
+    buf = io.StringIO()
+    exit_code = 0
+    with patch("sys.argv", argv), patch("sys.stdout", buf):
+        try:
+            main()
+        except SystemExit as e:
+            exit_code = e.code or 0
+    return buf.getvalue(), exit_code
+
+
+def test_dry_run_prints_summary(tmp_path):
+    """dry-run on a directory with code files prints a file-count summary."""
+    (tmp_path / "app.py").write_text("x = 1\n")
+    (tmp_path / "utils.py").write_text("def f(): pass\n")
+    out, code = _run_main(["graphify", "dry-run", str(tmp_path)])
+    assert code == 0
+    assert "Corpus scan" in out
+    assert "Code files" in out
+    assert "Total" in out
+
+
+def test_dry_run_no_files_written(tmp_path):
+    """dry-run must not create graphify-out/ or any output files."""
+    (tmp_path / "readme.md").write_text("# hello\n")
+    _run_main(["graphify", "dry-run", str(tmp_path)])
+    assert not (tmp_path / "graphify-out").exists()
+
+
+def test_dry_run_default_path(tmp_path, monkeypatch):
+    """dry-run with no path argument defaults to the current directory."""
+    (tmp_path / "main.py").write_text("print('hi')\n")
+    monkeypatch.chdir(tmp_path)
+    out, code = _run_main(["graphify", "dry-run"])
+    assert code == 0
+    assert "Corpus scan" in out
+
+
+def test_dry_run_missing_path(tmp_path):
+    """dry-run with a non-existent path exits non-zero."""
+    with pytest.raises(SystemExit) as exc:
+        with patch("sys.argv", ["graphify", "dry-run", str(tmp_path / "nonexistent")]):
+            from graphify.__main__ import main
+            main()
+    assert exc.value.code != 0
+
+
+def test_dry_run_no_graphify_out_written(tmp_path):
+    """dry-run output says no files were written."""
+    (tmp_path / "a.py").write_text("a = 1\n")
+    out, _ = _run_main(["graphify", "dry-run", str(tmp_path)])
+    assert "No files were written" in out
+
+
+def test_dry_run_office_no_sidecar_written(tmp_path):
+    """dry-run must not write office sidecars even when .docx/.xlsx files are present."""
+    from unittest.mock import MagicMock, patch as mpatch
+
+    # Create a fake .docx so detect sees it as an office file
+    (tmp_path / "report.docx").write_bytes(b"PK\x03\x04")  # minimal docx magic bytes
+
+    with mpatch("graphify.detect.convert_office_file") as mock_convert:
+        _run_main(["graphify", "dry-run", str(tmp_path)])
+
+    mock_convert.assert_not_called()
+
+
+def test_dry_run_office_missing_deps_warns(tmp_path):
+    """dry-run warns when office deps are missing and content would be empty in a real run."""
+    from unittest.mock import patch as mpatch
+
+    (tmp_path / "report.docx").write_bytes(b"PK\x03\x04")
+
+    # Simulate missing python-docx: docx_to_markdown returns ""
+    with mpatch("graphify.detect.docx_to_markdown", return_value=""):
+        out, code = _run_main(["graphify", "dry-run", str(tmp_path)])
+
+    assert code == 0
+    assert "office deps missing" in out.lower() or "office" in out.lower()
+    assert "pip install graphify[office]" in out