diff --git a/graphify/__main__.py b/graphify/__main__.py index be14274f..32e9846a 100644 --- a/graphify/__main__.py +++ b/graphify/__main__.py @@ -7,6 +7,8 @@ import sys from pathlib import Path +from . import paths as _paths + try: from importlib.metadata import version as _pkg_version __version__ = _pkg_version("graphifyy") @@ -14,6 +16,25 @@ __version__ = "unknown" +# Word-boundary substitution so install/print embeds the active GRAPHIFY_HOME +# in every host-config template (settings.json, CLAUDE.md, AGENTS.md, skills, ...). +_GRAPHIFY_OUT_TOKEN = re.compile(r"\bgraphify-out\b") + + +def _render(template: str) -> str: + return _GRAPHIFY_OUT_TOKEN.sub(_paths.home_name(), template) + + +def _install_skill_file(src: Path, dst: Path) -> None: + dst.parent.mkdir(parents=True, exist_ok=True) + dst.write_text(_render(src.read_text(encoding="utf-8")), encoding="utf-8") + + +def _maybe_migrate(project_dir: Path | None = None) -> None: + if _paths.auto_migrate(project_dir or Path(".")): + print(f" migrated legacy graphify-out/ -> {_paths.home_name()}/") + + def _check_skill_version(skill_dst: Path) -> None: """Warn if the installed skill is from an older graphify version.""" version_file = skill_dst.parent / ".graphify_version" @@ -36,19 +57,22 @@ def _refresh_all_version_stamps() -> None: if vf.exists(): vf.write_text(__version__, encoding="utf-8") -_SETTINGS_HOOK = { - "matcher": "Glob|Grep", - "hooks": [ - { - "type": "command", - "command": ( - "[ -f graphify-out/graph.json ] && " - r"""echo '{"hookSpecificOutput":{"hookEventName":"PreToolUse","additionalContext":"graphify: Knowledge graph exists. 
Read graphify-out/GRAPH_REPORT.md for god nodes and community structure before searching raw files."}}' """ - "|| true" - ), - } - ], -} +def _build_settings_hook() -> dict: + """Claude PreToolUse hook payload using the active home dir.""" + home = _paths.home_name() + return { + "matcher": "Glob|Grep", + "hooks": [ + { + "type": "command", + "command": ( + f"[ -f {home}/graph.json ] && " + f"""echo '{{"hookSpecificOutput":{{"hookEventName":"PreToolUse","additionalContext":"graphify: Knowledge graph exists. Read {home}/GRAPH_REPORT.md for god nodes and community structure before searching raw files."}}}}' """ + "|| true" + ), + } + ], + } _SKILL_REGISTRATION = ( "\n# graphify\n" @@ -129,6 +153,7 @@ def _refresh_all_version_stamps() -> None: def install(platform: str = "claude") -> None: + _maybe_migrate() if platform == "gemini": gemini_install() return @@ -154,8 +179,7 @@ def install(platform: str = "claude") -> None: skill_dst = _claude_base / "skills" / "graphify" / "SKILL.md" else: skill_dst = Path.home() / cfg["skill_dst"] - skill_dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(skill_src, skill_dst) + _install_skill_file(skill_src, skill_dst) (skill_dst.parent / ".graphify_version").write_text(__version__, encoding="utf-8") print(f" skill installed -> {skill_dst}") @@ -232,23 +256,27 @@ def install(platform: str = "claude") -> None: _GEMINI_MD_MARKER = "## graphify" -_GEMINI_HOOK = { - "matcher": "read_file|list_directory", - "hooks": [ - { - "type": "command", - "command": ( - "[ -f graphify-out/graph.json ] && " - r"""echo '{"decision":"allow","additionalContext":"graphify: Knowledge graph exists. 
Read graphify-out/GRAPH_REPORT.md for god nodes and community structure before searching raw files."}' """ - r"""|| echo '{"decision":"allow"}'""" - ), - } - ], -} +def _build_gemini_hook() -> dict: + """Gemini BeforeTool hook payload using the active home dir.""" + home = _paths.home_name() + return { + "matcher": "read_file|list_directory", + "hooks": [ + { + "type": "command", + "command": ( + f"[ -f {home}/graph.json ] && " + f"""echo '{{"decision":"allow","additionalContext":"graphify: Knowledge graph exists. Read {home}/GRAPH_REPORT.md for god nodes and community structure before searching raw files."}}' """ + r"""|| echo '{"decision":"allow"}'""" + ), + } + ], + } def gemini_install(project_dir: Path | None = None) -> None: """Copy skill file to ~/.gemini/skills/graphify/, write GEMINI.md section, and install BeforeTool hook.""" + _maybe_migrate(project_dir) # Copy skill file to ~/.gemini/skills/graphify/SKILL.md # On Windows, Gemini CLI prioritises ~/.agents/skills/ over ~/.gemini/skills/ skill_src = Path(__file__).parent / "skill.md" @@ -256,22 +284,22 @@ def gemini_install(project_dir: Path | None = None) -> None: skill_dst = Path.home() / ".agents" / "skills" / "graphify" / "SKILL.md" else: skill_dst = Path.home() / ".gemini" / "skills" / "graphify" / "SKILL.md" - skill_dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(skill_src, skill_dst) + _install_skill_file(skill_src, skill_dst) (skill_dst.parent / ".graphify_version").write_text(__version__, encoding="utf-8") print(f" skill installed -> {skill_dst}") target = (project_dir or Path(".")) / "GEMINI.md" + section = _render(_GEMINI_MD_SECTION) if target.exists(): content = target.read_text(encoding="utf-8") if _GEMINI_MD_MARKER in content: print("graphify already configured in GEMINI.md") else: - target.write_text(content.rstrip() + "\n\n" + _GEMINI_MD_SECTION, encoding="utf-8") + target.write_text(content.rstrip() + "\n\n" + section, encoding="utf-8") print(f"graphify section written to 
{target.resolve()}") else: - target.write_text(_GEMINI_MD_SECTION, encoding="utf-8") + target.write_text(section, encoding="utf-8") print(f"graphify section written to {target.resolve()}") _install_gemini_hook(project_dir or Path(".")) @@ -289,7 +317,7 @@ def _install_gemini_hook(project_dir: Path) -> None: settings = {} before_tool = settings.setdefault("hooks", {}).setdefault("BeforeTool", []) settings["hooks"]["BeforeTool"] = [h for h in before_tool if "graphify" not in str(h)] - settings["hooks"]["BeforeTool"].append(_GEMINI_HOOK) + settings["hooks"]["BeforeTool"].append(_build_gemini_hook()) settings_path.write_text(json.dumps(settings, indent=2), encoding="utf-8") print(" .gemini/settings.json -> BeforeTool hook registered") @@ -360,26 +388,27 @@ def gemini_uninstall(project_dir: Path | None = None) -> None: def vscode_install(project_dir: Path | None = None) -> None: """Install graphify skill for VS Code Copilot Chat + write .github/copilot-instructions.md.""" + _maybe_migrate(project_dir) skill_src = Path(__file__).parent / "skill-vscode.md" if not skill_src.exists(): skill_src = Path(__file__).parent / "skill-copilot.md" skill_dst = Path.home() / ".copilot" / "skills" / "graphify" / "SKILL.md" - skill_dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy(skill_src, skill_dst) + _install_skill_file(skill_src, skill_dst) (skill_dst.parent / ".graphify_version").write_text(__version__, encoding="utf-8") print(f" skill installed -> {skill_dst}") instructions = (project_dir or Path(".")) / ".github" / "copilot-instructions.md" instructions.parent.mkdir(parents=True, exist_ok=True) + section = _render(_VSCODE_INSTRUCTIONS_SECTION) if instructions.exists(): content = instructions.read_text(encoding="utf-8") if _VSCODE_INSTRUCTIONS_MARKER in content: print(f" {instructions} -> already configured (no change)") else: - instructions.write_text(content.rstrip() + "\n\n" + _VSCODE_INSTRUCTIONS_SECTION, encoding="utf-8") + instructions.write_text(content.rstrip() 
+ "\n\n" + section, encoding="utf-8") print(f" {instructions} -> graphify section added") else: - instructions.write_text(_VSCODE_INSTRUCTIONS_SECTION, encoding="utf-8") + instructions.write_text(section, encoding="utf-8") print(f" {instructions} -> created") print() @@ -462,12 +491,12 @@ def vscode_uninstall(project_dir: Path | None = None) -> None: def _kiro_install(project_dir: Path) -> None: """Write graphify skill + steering file for Kiro IDE/CLI.""" project_dir = project_dir or Path(".") + _maybe_migrate(project_dir) # Skill file → .kiro/skills/graphify/SKILL.md skill_src = Path(__file__).parent / "skill-kiro.md" skill_dst = project_dir / ".kiro" / "skills" / "graphify" / "SKILL.md" - skill_dst.parent.mkdir(parents=True, exist_ok=True) - skill_dst.write_text(skill_src.read_text(encoding="utf-8"), encoding="utf-8") + _install_skill_file(skill_src, skill_dst) print(f" {skill_dst.relative_to(project_dir)} -> /graphify skill") # Steering file → .kiro/steering/graphify.md (always-on) @@ -477,7 +506,7 @@ def _kiro_install(project_dir: Path) -> None: if steering_dst.exists() and _KIRO_STEERING_MARKER in steering_dst.read_text(encoding="utf-8"): print(f" .kiro/steering/graphify.md -> already configured") else: - steering_dst.write_text(_KIRO_STEERING, encoding="utf-8") + steering_dst.write_text(_render(_KIRO_STEERING), encoding="utf-8") print(f" .kiro/steering/graphify.md -> always-on steering written") print() @@ -527,7 +556,7 @@ def _antigravity_install(project_dir: Path) -> None: if rules_path.exists(): print(f"graphify rule already exists at {rules_path} (no change)") else: - rules_path.write_text(_ANTIGRAVITY_RULES, encoding="utf-8") + rules_path.write_text(_render(_ANTIGRAVITY_RULES), encoding="utf-8") print(f"graphify rule written to {rules_path.resolve()}") # 3. 
Write .agents/workflows/graphify.md @@ -546,7 +575,7 @@ def _antigravity_install(project_dir: Path) -> None: print("To enable full MCP architecture navigation, add this to ~/.gemini/antigravity/mcp_config.json:") print(' "graphify": {') print(' "command": "uv",') - print(' "args": ["run", "--with", "graphifyy", "--with", "mcp", "-m", "graphify.serve", "${workspace.path}/graphify-out/graph.json"]') + print(f' "args": ["run", "--with", "graphifyy", "--with", "mcp", "-m", "graphify.serve", "${{workspace.path}}/{_paths.home_name()}/graph.json"]') print(' }') @@ -598,12 +627,13 @@ def _antigravity_uninstall(project_dir: Path) -> None: def _cursor_install(project_dir: Path) -> None: """Write .cursor/rules/graphify.mdc with alwaysApply: true.""" + _maybe_migrate(project_dir) rule_path = (project_dir or Path(".")) / _CURSOR_RULE_PATH rule_path.parent.mkdir(parents=True, exist_ok=True) if rule_path.exists(): print(f"graphify rule already exists at {rule_path} (no change)") return - rule_path.write_text(_CURSOR_RULE, encoding="utf-8") + rule_path.write_text(_render(_CURSOR_RULE), encoding="utf-8") print(f"graphify rule written to {rule_path.resolve()}") print() print("Cursor will now always include the knowledge graph context.") @@ -655,7 +685,7 @@ def _install_opencode_plugin(project_dir: Path) -> None: """Write graphify.js plugin and register it in opencode.json.""" plugin_file = project_dir / _OPENCODE_PLUGIN_PATH plugin_file.parent.mkdir(parents=True, exist_ok=True) - plugin_file.write_text(_OPENCODE_PLUGIN_JS, encoding="utf-8") + plugin_file.write_text(_render(_OPENCODE_PLUGIN_JS), encoding="utf-8") print(f" {_OPENCODE_PLUGIN_PATH} -> tool.execute.before hook written") config_file = project_dir / _OPENCODE_CONFIG_PATH @@ -701,25 +731,28 @@ def _uninstall_opencode_plugin(project_dir: Path) -> None: print(f" {_OPENCODE_CONFIG_PATH} -> plugin deregistered") -_CODEX_HOOK = { - "hooks": { - "PreToolUse": [ - { - "matcher": "Bash", - "hooks": [ - { - "type": "command", - 
"command": ( - "[ -f graphify-out/graph.json ] && " - r"""echo '{"hookSpecificOutput":{"hookEventName":"PreToolUse","additionalContext":"graphify: Knowledge graph exists. Read graphify-out/GRAPH_REPORT.md for god nodes and community structure before searching raw files."}}' """ - "|| true" - ), - } - ], - } - ] +def _build_codex_hook() -> dict: + """Codex PreToolUse hook payload using the active home dir.""" + home = _paths.home_name() + return { + "hooks": { + "PreToolUse": [ + { + "matcher": "Bash", + "hooks": [ + { + "type": "command", + "command": ( + f"[ -f {home}/graph.json ] && " + f"""echo '{{"hookSpecificOutput":{{"hookEventName":"PreToolUse","additionalContext":"graphify: Knowledge graph exists. Read {home}/GRAPH_REPORT.md for god nodes and community structure before searching raw files."}}}}' """ + "|| true" + ), + } + ], + } + ] + } } -} def _install_codex_hook(project_dir: Path) -> None: @@ -737,7 +770,7 @@ def _install_codex_hook(project_dir: Path) -> None: pre_tool = existing.setdefault("hooks", {}).setdefault("PreToolUse", []) existing["hooks"]["PreToolUse"] = [h for h in pre_tool if "graphify" not in str(h)] - existing["hooks"]["PreToolUse"].extend(_CODEX_HOOK["hooks"]["PreToolUse"]) + existing["hooks"]["PreToolUse"].extend(_build_codex_hook()["hooks"]["PreToolUse"]) hooks_path.write_text(json.dumps(existing, indent=2), encoding="utf-8") print(f" .codex/hooks.json -> PreToolUse hook registered") @@ -760,17 +793,19 @@ def _uninstall_codex_hook(project_dir: Path) -> None: def _agents_install(project_dir: Path, platform: str) -> None: """Write the graphify section to the local AGENTS.md (Codex/OpenCode/OpenClaw).""" + _maybe_migrate(project_dir) target = (project_dir or Path(".")) / "AGENTS.md" + section = _render(_AGENTS_MD_SECTION) if target.exists(): content = target.read_text(encoding="utf-8") if _AGENTS_MD_MARKER in content: print(f"graphify already configured in AGENTS.md") else: - target.write_text(content.rstrip() + "\n\n" + 
_AGENTS_MD_SECTION, encoding="utf-8") + target.write_text(content.rstrip() + "\n\n" + section, encoding="utf-8") print(f"graphify section written to {target.resolve()}") else: - target.write_text(_AGENTS_MD_SECTION, encoding="utf-8") + target.write_text(section, encoding="utf-8") print(f"graphify section written to {target.resolve()}") if platform == "codex": @@ -819,16 +854,18 @@ def _agents_uninstall(project_dir: Path, platform: str = "") -> None: def claude_install(project_dir: Path | None = None) -> None: """Write the graphify section to the local CLAUDE.md.""" + _maybe_migrate(project_dir) target = (project_dir or Path(".")) / "CLAUDE.md" + section = _render(_CLAUDE_MD_SECTION) if target.exists(): content = target.read_text(encoding="utf-8") if _CLAUDE_MD_MARKER in content: print("graphify already configured in CLAUDE.md") return - new_content = content.rstrip() + "\n\n" + _CLAUDE_MD_SECTION + new_content = content.rstrip() + "\n\n" + section else: - new_content = _CLAUDE_MD_SECTION + new_content = section target.write_text(new_content, encoding="utf-8") print(f"graphify section written to {target.resolve()}") @@ -858,7 +895,7 @@ def _install_claude_hook(project_dir: Path) -> None: pre_tool = hooks.setdefault("PreToolUse", []) hooks["PreToolUse"] = [h for h in pre_tool if not (h.get("matcher") == "Glob|Grep" and "graphify" in str(h))] - hooks["PreToolUse"].append(_SETTINGS_HOOK) + hooks["PreToolUse"].append(_build_settings_hook()) settings_path.write_text(json.dumps(settings, indent=2), encoding="utf-8") print(f" .claude/settings.json -> PreToolUse hook registered") @@ -973,17 +1010,18 @@ def main() -> None: _check_skill_version(skill_dst) if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"): + home = _paths.home_name() print("Usage: graphify ") print() print("Commands:") print(" install [--platform P] copy skill to platform config dir (claude|windows|codex|opencode|aider|claw|droid|trae|trae-cn|gemini|cursor|antigravity|hermes|kiro)") print(" path \"A\" 
\"B\" shortest path between two nodes in graph.json") - print(" --graph path to graph.json (default graphify-out/graph.json)") + print(f" --graph path to graph.json (default {home}/graph.json)") print(" explain \"X\" plain-language explanation of a node and its neighbors") - print(" --graph path to graph.json (default graphify-out/graph.json)") + print(f" --graph path to graph.json (default {home}/graph.json)") print(" clone clone a GitHub repo locally and print its path for /graphify") print(" merge-graphs merge two or more graph.json files into one cross-repo graph") - print(" --out output path (default: graphify-out/merged-graph.json)") + print(f" --out output path (default: {home}/merged-graph.json)") print(" --branch checkout a specific branch (default: repo default)") print(" --out clone to a custom directory (default: ~/.graphify/repos//)") print(" add fetch a URL and save it to ./raw, then update the graph") @@ -996,15 +1034,18 @@ def main() -> None: print(" query \"\" BFS traversal of graph.json for a question") print(" --dfs use depth-first instead of breadth-first") print(" --budget N cap output at N tokens (default 2000)") - print(" --graph path to graph.json (default graphify-out/graph.json)") - print(" save-result save a Q&A result to graphify-out/memory/ for graph feedback loop") + print(f" --graph path to graph.json (default {home}/graph.json)") + print(f" save-result save a Q&A result to {home}/memory/ for graph feedback loop") print(" --question Q the question asked") print(" --answer A the answer to save") print(" --type T query type: query|path_query|explain (default: query)") print(" --nodes N1 N2 ... 
source node labels cited in the answer") - print(" --memory-dir DIR memory directory (default: graphify-out/memory)") + print(f" --memory-dir DIR memory directory (default: {home}/memory)") print(" check-update check needs_update flag and notify if semantic re-extraction is pending (cron-safe)") print(" benchmark [graph.json] measure token reduction vs naive full-corpus approach") + print(f" migrate-home move legacy graphify-out/ to the configured home dir (currently {home}/)") + print(" --dry-run print what would happen, don't move anything") + print(" --force merge into existing target (target files win on conflict)") print(" hook install install post-commit/post-checkout git hooks (all platforms)") print(" hook uninstall remove git hooks") print(" hook status check if git hooks are installed") @@ -1149,6 +1190,7 @@ def main() -> None: from graphify.hooks import install as hook_install, uninstall as hook_uninstall, status as hook_status subcmd = sys.argv[2] if len(sys.argv) > 2 else "" if subcmd == "install": + _maybe_migrate() print(hook_install(Path("."))) elif subcmd == "uninstall": print(hook_uninstall(Path("."))) @@ -1167,7 +1209,7 @@ def main() -> None: question = sys.argv[2] use_dfs = "--dfs" in sys.argv budget = 2000 - graph_path = "graphify-out/graph.json" + graph_path = str(_paths.graph_path()) args = sys.argv[3:] i = 0 while i < len(args): @@ -1223,7 +1265,7 @@ def main() -> None: p.add_argument("--answer", required=True) p.add_argument("--type", dest="query_type", default="query") p.add_argument("--nodes", nargs="*", default=[]) - p.add_argument("--memory-dir", default="graphify-out/memory") + p.add_argument("--memory-dir", default=str(_paths.memory_dir())) opts = p.parse_args(sys.argv[2:]) from graphify.ingest import save_query_result as _sqr out = _sqr( @@ -1243,7 +1285,7 @@ def main() -> None: import networkx as _nx source_label = sys.argv[2] target_label = sys.argv[3] - graph_path = "graphify-out/graph.json" + graph_path = str(_paths.graph_path()) 
args = sys.argv[4:] for i, a in enumerate(args): if a == "--graph" and i + 1 < len(args): @@ -1291,7 +1333,7 @@ def main() -> None: from graphify.serve import _find_node from networkx.readwrite import json_graph label = sys.argv[2] - graph_path = "graphify-out/graph.json" + graph_path = str(_paths.graph_path()) args = sys.argv[3:] for i, a in enumerate(args): if a == "--graph" and i + 1 < len(args): @@ -1370,7 +1412,7 @@ def main() -> None: elif cmd == "cluster-only": watch_path = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(".") - graph_json = watch_path / "graphify-out" / "graph.json" + graph_json = _paths.graph_path(watch_path) if not graph_json.exists(): print(f"error: no graph found at {graph_json} — run /graphify first", file=sys.stderr) sys.exit(1) @@ -1395,7 +1437,7 @@ def main() -> None: report = generate(G, communities, cohesion, labels, gods, surprises, {"warning": "cluster-only mode — file stats not available"}, tokens, str(watch_path), suggested_questions=questions) - out = watch_path / "graphify-out" + out = _paths.home(watch_path) (out / "GRAPH_REPORT.md").write_text(report, encoding="utf-8") to_json(G, communities, str(out / "graph.json")) to_html(G, communities, str(out / "graph.html"), community_labels=labels or None) @@ -1426,7 +1468,7 @@ def main() -> None: # graphify merge-graphs graph1.json graph2.json ... --out merged.json args = sys.argv[2:] graph_paths: list[Path] = [] - out_path = Path("graphify-out/merged-graph.json") + out_path = _paths.home() / "merged-graph.json" i = 0 while i < len(args): if args[i] == "--out" and i + 1 < len(args): @@ -1448,8 +1490,8 @@ def main() -> None: G = _jg.node_link_graph(data, edges="links") except TypeError: G = _jg.node_link_graph(data) - # Tag every node with which repo it came from - repo_tag = gp.parent.parent.name # graphify-out/../ → repo dir name + # graph.json lives at //graph.json, so repo dir is two parents up. 
+ repo_tag = gp.parent.parent.name for node in G.nodes: G.nodes[node].setdefault("repo", repo_tag) graphs.append(G) @@ -1484,7 +1526,7 @@ def main() -> None: elif cmd == "benchmark": from graphify.benchmark import run_benchmark, print_benchmark - graph_path = sys.argv[2] if len(sys.argv) > 2 else "graphify-out/graph.json" + graph_path: str | Path = sys.argv[2] if len(sys.argv) > 2 else _paths.graph_path() # Try to load corpus_words from detect output corpus_words = None detect_path = Path(".graphify_detect.json") @@ -1496,6 +1538,55 @@ def main() -> None: pass result = run_benchmark(graph_path, corpus_words=corpus_words) print_benchmark(result) + + elif cmd == "migrate-home": + args = sys.argv[2:] + dry_run = "--dry-run" in args + force = "--force" in args + root = Path(".").resolve() + legacy = root / _paths.LEGACY_HOME_NAME + target = _paths.home(root) + + if not legacy.exists(): + print(f"No legacy {_paths.LEGACY_HOME_NAME}/ found at {root} - nothing to migrate.") + return + if target.exists() and not force: + print( + f"Both {legacy.name}/ and {target.name}/ exist. " + "Refusing to overwrite. 
Re-run with --force to merge " + f"{legacy.name}/ into {target.name}/, or remove one of them first.", + file=sys.stderr, + ) + sys.exit(2) + + if dry_run: + if target.exists(): + print(f"[dry-run] Would merge {legacy} -> {target}") + else: + print(f"[dry-run] Would rename {legacy} -> {target}") + return + + if not target.exists(): + target.parent.mkdir(parents=True, exist_ok=True) + legacy.rename(target) + print(f"Migrated {legacy.name}/ -> {target.name}/") + else: + # --force: shallow merge file-by-file (existing files in target win) + moved = 0 + for src in legacy.rglob("*"): + rel = src.relative_to(legacy) + dst = target / rel + if src.is_dir(): + dst.mkdir(parents=True, exist_ok=True) + continue + if dst.exists(): + continue + dst.parent.mkdir(parents=True, exist_ok=True) + src.rename(dst) + moved += 1 + shutil.rmtree(legacy, ignore_errors=True) + print(f"Merged {moved} files from {legacy.name}/ into {target.name}/ and removed the legacy directory.") + else: print(f"error: unknown command '{cmd}'", file=sys.stderr) print("Run 'graphify --help' for usage.", file=sys.stderr) diff --git a/graphify/benchmark.py b/graphify/benchmark.py index dc420564..306b8825 100644 --- a/graphify/benchmark.py +++ b/graphify/benchmark.py @@ -62,19 +62,22 @@ def _query_subgraph_tokens(G: nx.Graph, question: str, depth: int = 3) -> int: def run_benchmark( - graph_path: str = "graphify-out/graph.json", + graph_path: str | Path | None = None, corpus_words: int | None = None, questions: list[str] | None = None, ) -> dict: """Measure token reduction: corpus tokens vs graphify query tokens. Args: - graph_path: path to the built graph + graph_path: path to the built graph; defaults to ``$GRAPHIFY_HOME/graph.json`` corpus_words: total word count from detect() output; if None, estimated from graph questions: list of questions to benchmark; defaults to _SAMPLE_QUESTIONS Returns dict with: corpus_tokens, avg_query_tokens, reduction_ratio, per_question """ + if graph_path is None: + from . 
import paths as _paths + graph_path = _paths.graph_path() data = json.loads(Path(graph_path).read_text(encoding="utf-8")) try: G = json_graph.node_link_graph(data, edges="links") diff --git a/graphify/build.py b/graphify/build.py index 2c2c773b..96141719 100644 --- a/graphify/build.py +++ b/graphify/build.py @@ -180,16 +180,21 @@ def deduplicate_by_label(nodes: list[dict], edges: list[dict]) -> tuple[list[dic def build_merge( new_chunks: list[dict], - graph_path: str | Path = "graphify-out/graph.json", + graph_path: str | Path | None = None, prune_sources: list[str] | None = None, *, directed: bool = False, ) -> nx.Graph: """Load existing graph.json, merge new chunks into it, and save back. + *graph_path* defaults to ``$GRAPHIFY_HOME/graph.json``. + Never replaces — only grows (or prunes deleted-file nodes via prune_sources). Safe to call repeatedly: existing nodes and edges are preserved. """ + if graph_path is None: + from . import paths as _paths + graph_path = _paths.graph_path() from networkx.readwrite import json_graph as _jg graph_path = Path(graph_path) diff --git a/graphify/cache.py b/graphify/cache.py index af153d93..ae9c3e24 100644 --- a/graphify/cache.py +++ b/graphify/cache.py @@ -44,17 +44,19 @@ def file_hash(path: Path, root: Path = Path(".")) -> str: def cache_dir(root: Path = Path(".")) -> Path: - """Returns graphify-out/cache/ - creates it if needed.""" - d = Path(root).resolve() / "graphify-out" / "cache" - d.mkdir(parents=True, exist_ok=True) - return d + """Per-file extraction cache directory under *root* — created if missing. + + Resolves to ``$GRAPHIFY_HOME/cache``. See :mod:`graphify.paths`. + """ + from . import paths + return paths.cache_dir(root, create=True) def load_cached(path: Path, root: Path = Path(".")) -> dict | None: """Return cached extraction for this file if hash matches, else None. Cache key: SHA256 of file contents. - Cache value: stored as graphify-out/cache/{hash}.json + Cache value: stored as ``/{hash}.json``. 
Returns None if no cache entry or file has changed. """ try: @@ -73,7 +75,7 @@ def load_cached(path: Path, root: Path = Path(".")) -> dict | None: def save_cached(path: Path, result: dict, root: Path = Path(".")) -> None: """Save extraction result for this file. - Stores as graphify-out/cache/{hash}.json where hash = SHA256 of current file contents. + Stores as ``/{hash}.json`` where hash = SHA256 of current file contents. result should be a dict with 'nodes' and 'edges' lists. No-ops if `path` is not a regular file. Subagent-produced semantic fragments @@ -108,7 +110,7 @@ def cached_files(root: Path = Path(".")) -> set[str]: def clear_cache(root: Path = Path(".")) -> None: - """Delete all graphify-out/cache/*.json files.""" + """Delete all cached extraction entries.""" d = cache_dir(root) for f in d.glob("*.json"): f.unlink() diff --git a/graphify/detect.py b/graphify/detect.py index 33844929..fea917fa 100644 --- a/graphify/detect.py +++ b/graphify/detect.py @@ -1,12 +1,13 @@ # file discovery, type classification, and corpus health checks from __future__ import annotations -import fnmatch import json import os import re from enum import Enum from pathlib import Path +import pathspec + class FileType(str, Enum): CODE = "code" @@ -16,7 +17,8 @@ class FileType(str, Enum): VIDEO = "video" -_MANIFEST_PATH = "graphify-out/manifest.json" +from . 
import paths as _paths + CODE_EXTENSIONS = {'.py', '.ts', '.js', '.jsx', '.tsx', '.mjs', '.ejs', '.go', '.rs', '.java', '.cpp', '.cc', '.cxx', '.c', '.h', '.hpp', '.rb', '.swift', '.kt', '.kts', '.cs', '.scala', '.php', '.lua', '.toc', '.zig', '.ps1', '.ex', '.exs', '.m', '.mm', '.jl', '.vue', '.svelte', '.dart', '.v', '.sv'} DOC_EXTENSIONS = {'.md', '.mdx', '.txt', '.rst', '.html'} @@ -233,105 +235,143 @@ def count_words(path: Path) -> int: return 0 -# Directory names to always skip - venvs, caches, build artifacts, deps -_SKIP_DIRS = { - "venv", ".venv", "env", ".env", - "node_modules", "__pycache__", ".git", - "dist", "build", "target", "out", - "site-packages", "lib64", - ".pytest_cache", ".mypy_cache", ".ruff_cache", - ".tox", ".eggs", "*.egg-info", - "graphify-out", # never treat own output as source input (#524) -} - -# Large generated files that are never useful to extract -_SKIP_FILES = { +# Built-in noise prepended to every ignore chain. A user `!`-rule overrides +# any of these via last-match-wins. Build-output dirs (dist/, build/, ...) are +# intentionally NOT listed — those are project-specific and belong in .gitignore. +_BUILTIN_NOISE_PATTERNS: tuple[str, ...] = ( + ".*", + "__pycache__/", + "venv/", "env/", + "*_venv/", "*_env/", + "*.egg-info/", + "site-packages/", "lib64/", + "node_modules/", + ".graphify/", "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock", "poetry.lock", "Gemfile.lock", "composer.lock", "go.sum", "go.work.sum", -} - -def _is_noise_dir(part: str) -> bool: - """Return True if this directory name looks like a venv, cache, or dep dir.""" - if part in _SKIP_DIRS: - return True - # Catch *_venv, *_repo/site-packages patterns - if part.endswith("_venv") or part.endswith("_env"): - return True - if part.endswith(".egg-info"): - return True - return False - - -def _load_graphifyignore(root: Path) -> list[tuple[Path, str]]: - """Read .graphifyignore from root **and ancestor directories**. 
- - Returns a list of (anchor_dir, pattern) pairs. Each pattern is matched - against paths relative to both the scan root and the anchor_dir where - the .graphifyignore file was found — so patterns written relative to a - parent directory still work when graphify is run on a subfolder. - - Walks upward from *root* towards the filesystem root, stopping at a - ``.git`` boundary. Lines starting with # are comments; blank lines ignored. - """ - patterns: list[tuple[Path, str]] = [] - current = root.resolve() - while True: - ignore_file = current / ".graphifyignore" - if ignore_file.exists(): - for line in ignore_file.read_text(encoding="utf-8", errors="ignore").splitlines(): - line = line.strip() - if line and not line.startswith("#"): - patterns.append((current, line)) - # Stop climbing once we've processed the git repo root - if (current / ".git").exists(): - break - parent = current.parent - if parent == current: - break # filesystem root - current = parent - return patterns +) +_BUILTIN_NOISE_SPEC = pathspec.GitIgnoreSpec.from_lines(_BUILTIN_NOISE_PATTERNS) +# Pruning shortcut for the .gitignore-discovery walk only — descending into +# node_modules just to look for nested ignore files would dominate detect() time. 
+_DISCOVERY_SKIP_DIRS = frozenset({ + ".git", "node_modules", ".venv", "venv", "__pycache__", ".graphify", +}) -def _is_ignored(path: Path, root: Path, patterns: list[tuple[Path, str]]) -> bool: - """Return True if path matches any .graphifyignore pattern.""" - if not patterns: - return False - def _matches(rel: str, p: str) -> bool: - parts = rel.split("/") - if fnmatch.fnmatch(rel, p): - return True - if fnmatch.fnmatch(path.name, p): - return True - for i, part in enumerate(parts): - if fnmatch.fnmatch(part, p): - return True - if fnmatch.fnmatch("/".join(parts[:i + 1]), p): - return True - return False +AnchoredSpec = tuple[Path, "pathspec.PathSpec"] - for anchor, pattern in patterns: - p = pattern.strip("/") - if not p: - continue - # Try path relative to the scan root + +def _respect_gitignore() -> bool: + """Return True unless the user has opted out of .gitignore honoring.""" + flag = os.environ.get("GRAPHIFY_RESPECT_GITIGNORE", "1").strip().lower() + return flag not in ("0", "false", "no", "off") + + +def _load_ignore_file(ignore_file: Path) -> "pathspec.PathSpec | None": + """Compile a single ignore file into a gitwildmatch PathSpec, or None on read failure.""" + try: + text = ignore_file.read_text(encoding="utf-8", errors="ignore") + except OSError: + return None + spec = pathspec.GitIgnoreSpec.from_lines(text.splitlines()) + return spec if spec.patterns else None + + +def _collect_ignore_files(root: Path, names: tuple[str, ...]) -> list[Path]: + """Every ignore file (matching any of *names*) that affects *root*, in evaluation order. + + Outer-first by depth, then by *names* order within an anchor — combined with + last-match-wins in :func:`_is_ignored`, a later name overrides an earlier + co-located one. Walks up to the nearest ``.git`` so repo-level rules apply + on subdirectories, then walks down through *root* for nested rules. 
+ """ + root = root.resolve() + + chain: list[Path] = [] + cursor = root + while True: + chain.append(cursor) + if (cursor / ".git").exists(): + break + parent = cursor.parent + if parent == cursor: + break + cursor = parent + chain.reverse() + + files: list[Path] = [] + for anc in chain: + for name in names: + f = anc / name + if f.exists(): + files.append(f) + + if root.is_dir(): + seen = set(files) + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in _DISCOVERY_SKIP_DIRS] + dp = Path(dirpath) + for name in names: + if name in filenames: + f = dp / name + if f not in seen: + seen.add(f) + files.append(f) + + return files + + +def _load_ignore_specs( + root: Path, names: tuple[str, ...] +) -> list[AnchoredSpec]: + """Load every ignore file matching *names* into anchored PathSpecs.""" + specs: list[AnchoredSpec] = [] + for ignore_file in _collect_ignore_files(root, names): + spec = _load_ignore_file(ignore_file) + if spec is not None: + specs.append((ignore_file.parent.resolve(), spec)) + return specs + + +def _load_gitignore(root: Path) -> list[AnchoredSpec]: + """Every .gitignore affecting *root*, in evaluation order. Skipped if GRAPHIFY_RESPECT_GITIGNORE=0.""" + if not _respect_gitignore(): + return [] + return _load_ignore_specs(root, (".gitignore",)) + + +def _load_graphifyignore(root: Path) -> list[AnchoredSpec]: + """Every .graphifyignore affecting *root*, in evaluation order. Same syntax as .gitignore.""" + return _load_ignore_specs(root, (".graphifyignore",)) + + +def _is_ignored( + path: Path, + specs: list[AnchoredSpec], + *, + is_dir: bool = False, +) -> bool: + """Last-match-wins across the spec chain. Pass *is_dir* True so dir-only patterns fire. + + *path* must be absolute and resolved. Each spec is anchored to its source file's + directory; patterns outside that subtree don't apply. 
A re-include via ``!`` cannot + rescue a file from a parent dir that was already pruned — the caller enforces this + by not descending into ignored dirs. + """ + state = False + for anchor, spec in specs: try: - rel = str(path.relative_to(root)).replace(os.sep, "/") - if _matches(rel, p): - return True + rel = path.relative_to(anchor).as_posix() except ValueError: - pass - # Also try relative to the anchor dir (the .graphifyignore's location), - # so patterns written at a parent level still fire when running on a subfolder - if anchor != root: - try: - rel_anchor = str(path.relative_to(anchor)).replace(os.sep, "/") - if _matches(rel_anchor, p): - return True - except ValueError: - pass - return False + continue + if is_dir and not rel.endswith("/"): + rel = rel + "/" + result = spec.check_file(rel) + if result.include is not None: + state = result.include + return state def detect(root: Path, *, follow_symlinks: bool = False) -> dict: @@ -346,19 +386,23 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict: total_words = 0 skipped_sensitive: list[str] = [] - ignore_patterns = _load_graphifyignore(root) - - # Always include graphify-out/memory/ - query results filed back into the graph - memory_dir = root / "graphify-out" / "memory" - scan_paths = [root] + ignore_names = (".graphifyignore",) + if _respect_gitignore(): + ignore_names = (".gitignore",) + ignore_names + user_specs = _load_ignore_specs(root, ignore_names) + ignore_patterns: list[AnchoredSpec] = [(root, _BUILTIN_NOISE_SPEC), *user_specs] + + # memory dir scans without ignore filtering — its contents are wanted + # even though it lives under .graphify/ which the noise spec prunes. 
+ memory_dir = _paths.memory_dir(root) + scan_paths: list[tuple[Path, list[AnchoredSpec]]] = [(root, ignore_patterns)] if memory_dir.exists(): - scan_paths.append(memory_dir) + scan_paths.append((memory_dir, [])) seen: set[Path] = set() all_files: list[Path] = [] - for scan_root in scan_paths: - in_memory_tree = memory_dir.exists() and str(scan_root).startswith(str(memory_dir)) + for scan_root, scan_specs in scan_paths: for dirpath, dirnames, filenames in os.walk(scan_root, followlinks=follow_symlinks): dp = Path(dirpath) if follow_symlinks and os.path.islink(dirpath): @@ -367,37 +411,17 @@ def detect(root: Path, *, follow_symlinks: bool = False) -> dict: if parent_real == real or parent_real.startswith(real + os.sep): dirnames.clear() continue - if not in_memory_tree: - # Prune noise dirs in-place so os.walk never descends into them - dirnames[:] = [ - d for d in dirnames - if not d.startswith(".") - and not _is_noise_dir(d) - and not _is_ignored(dp / d, root, ignore_patterns) - ] + dirnames[:] = [d for d in dirnames if not _is_ignored(dp / d, scan_specs, is_dir=True)] for fname in filenames: - if fname in _SKIP_FILES: - continue p = dp / fname - if p not in seen: - seen.add(p) - all_files.append(p) + if p in seen or _is_ignored(p, scan_specs): + continue + seen.add(p) + all_files.append(p) - converted_dir = root / "graphify-out" / "converted" + converted_dir = _paths.converted_dir(root) for p in all_files: - # For memory dir files, skip hidden/noise filtering - in_memory = memory_dir.exists() and str(p).startswith(str(memory_dir)) - if not in_memory: - # Hidden files are already excluded via dir pruning above, - # but catch hidden files at the root level - if p.name.startswith("."): - continue - # Skip files inside our own converted/ dir (avoid re-processing sidecars) - if str(p).startswith(str(converted_dir)): - continue - if _is_ignored(p, root, ignore_patterns): - continue if _is_sensitive(p): skipped_sensitive.append(str(p)) continue @@ -441,20 +465,24 @@ def 
detect(root: Path, *, follow_symlinks: bool = False) -> dict: "needs_graph": needs_graph, "warning": warning, "skipped_sensitive": skipped_sensitive, - "graphifyignore_patterns": len(ignore_patterns), + "graphifyignore_patterns": sum(len(spec.patterns) for _, spec in user_specs), } -def load_manifest(manifest_path: str = _MANIFEST_PATH) -> dict[str, float]: - """Load the file modification time manifest from a previous run.""" +def load_manifest(manifest_path: str | Path | None = None) -> dict[str, float]: + """Load the file mtime manifest from a previous run.""" + if manifest_path is None: + manifest_path = _paths.manifest_path() try: return json.loads(Path(manifest_path).read_text(encoding="utf-8")) except Exception: return {} -def save_manifest(files: dict[str, list[str]], manifest_path: str = _MANIFEST_PATH) -> None: - """Save current file mtimes so the next --update run can diff against them.""" +def save_manifest(files: dict[str, list[str]], manifest_path: str | Path | None = None) -> None: + """Save current file mtimes for the next --update diff.""" + if manifest_path is None: + manifest_path = _paths.manifest_path() manifest: dict[str, float] = {} for file_list in files.values(): for f in file_list: @@ -466,7 +494,7 @@ def save_manifest(files: dict[str, list[str]], manifest_path: str = _MANIFEST_PA Path(manifest_path).write_text(json.dumps(manifest, indent=2), encoding="utf-8") -def detect_incremental(root: Path, manifest_path: str = _MANIFEST_PATH) -> dict: +def detect_incremental(root: Path, manifest_path: str | Path | None = None) -> dict: """Like detect(), but returns only new or modified files since the last run. Compares current file mtimes against the stored manifest. 
diff --git a/graphify/extract.py b/graphify/extract.py index dbd441c6..40217d98 100644 --- a/graphify/extract.py +++ b/graphify/extract.py @@ -3206,9 +3206,9 @@ def extract(paths: list[Path], cache_root: Path | None = None) -> dict: Args: paths: files to extract from - cache_root: explicit root for graphify-out/cache/ (overrides the + cache_root: explicit root for the per-file cache (overrides the inferred common path prefix). Pass Path('.') when running on a - subdirectory so the cache stays at ./graphify-out/cache/. + subdirectory so the cache stays at ``./$GRAPHIFY_HOME/cache/``. """ _check_tree_sitter_version() per_file: list[dict] = [] @@ -3391,38 +3391,26 @@ def collect_files(target: Path, *, follow_symlinks: bool = False, root: Path | N ".lua", ".toc", ".zig", ".ps1", ".m", ".mm", } - from graphify.detect import _load_graphifyignore, _is_ignored + from graphify.detect import _BUILTIN_NOISE_SPEC, _load_graphifyignore, _is_ignored ignore_root = root if root is not None else target - patterns = _load_graphifyignore(ignore_root) - - def _ignored(p: Path) -> bool: - return bool(patterns and _is_ignored(p, ignore_root, patterns)) - - if not follow_symlinks: - results: list[Path] = [] - for ext in sorted(_EXTENSIONS): - results.extend( - p for p in target.rglob(f"*{ext}") - if not any(part.startswith(".") for part in p.parts) - and not _ignored(p) - ) - return sorted(results) - # Walk with symlink following + cycle detection - results = [] - for dirpath, dirnames, filenames in os.walk(target, followlinks=True): - if os.path.islink(dirpath): + target_resolved = target.resolve() + # Prepend the built-in noise spec so node_modules / venvs / dotdirs are + # pruned by the same matcher as user .graphifyignore rules. 
+ patterns = [(target_resolved, _BUILTIN_NOISE_SPEC), *_load_graphifyignore(ignore_root)] + + results: list[Path] = [] + for dirpath, dirnames, filenames in os.walk(target, followlinks=follow_symlinks): + dp = Path(dirpath) + if follow_symlinks and os.path.islink(dirpath): real = os.path.realpath(dirpath) parent_real = os.path.realpath(os.path.dirname(dirpath)) if parent_real == real or parent_real.startswith(real + os.sep): dirnames.clear() continue - dp = Path(dirpath) - if any(part.startswith(".") for part in dp.parts): - dirnames.clear() - continue + dirnames[:] = [d for d in dirnames if not _is_ignored(dp / d, patterns, is_dir=True)] for fname in filenames: p = dp / fname - if p.suffix in _EXTENSIONS and not fname.startswith(".") and not _ignored(p): + if p.suffix in _EXTENSIONS and not _is_ignored(p, patterns): results.append(p) return sorted(results) diff --git a/graphify/hooks.py b/graphify/hooks.py index 3fa7d2e5..67e9d37b 100644 --- a/graphify/hooks.py +++ b/graphify/hooks.py @@ -98,8 +98,10 @@ exit 0 fi -# Only run if graphify-out/ exists (graph has been built before) -if [ ! -d "graphify-out" ]; then +# Only run if the graphify home dir exists (graph has been built before). +# Honors $GRAPHIFY_HOME if set, defaulting to .graphify. +GRAPHIFY_HOME_DIR="${GRAPHIFY_HOME:-.graphify}" +if [ ! -d "$GRAPHIFY_HOME_DIR" ]; then exit 0 fi diff --git a/graphify/ingest.py b/graphify/ingest.py index 62d8386b..cb2c6c04 100644 --- a/graphify/ingest.py +++ b/graphify/ingest.py @@ -244,9 +244,9 @@ def save_query_result( ) -> Path: """Save a Q&A result as markdown so it gets extracted into the graph on next --update. - Files are stored in memory_dir (typically graphify-out/memory/) with YAML frontmatter - that graphify's extractor reads as node metadata. This closes the feedback loop: - the system grows smarter from both what you add AND what you ask. 
+ Files are stored in *memory_dir* (typically ``$GRAPHIFY_HOME/memory/``) with YAML + frontmatter that graphify's extractor reads as node metadata. This closes the + feedback loop: the system grows smarter from both what you add AND what you ask. """ memory_dir = Path(memory_dir) memory_dir.mkdir(parents=True, exist_ok=True) diff --git a/graphify/paths.py b/graphify/paths.py new file mode 100644 index 00000000..b161187e --- /dev/null +++ b/graphify/paths.py @@ -0,0 +1,75 @@ +# Configurable path resolution for graphify outputs (set via GRAPHIFY_HOME). +from __future__ import annotations + +import os +from pathlib import Path + +DEFAULT_HOME_NAME = ".graphify" +LEGACY_HOME_NAME = "graphify-out" +ENV_HOME = "GRAPHIFY_HOME" + + +def home_name() -> str: + """Configured home dir name (env GRAPHIFY_HOME or DEFAULT_HOME_NAME). Read each call.""" + val = os.environ.get(ENV_HOME, "").strip() + return val or DEFAULT_HOME_NAME + + +def home(root: Path | str = Path(".")) -> Path: + return Path(root).resolve() / home_name() + + +def cache_dir(root: Path | str = Path("."), *, create: bool = True) -> Path: + d = home(root) / "cache" + if create: + d.mkdir(parents=True, exist_ok=True) + return d + + +def manifest_path(root: Path | str = Path(".")) -> Path: + return home(root) / "manifest.json" + + +def memory_dir(root: Path | str = Path(".")) -> Path: + return home(root) / "memory" + + +def converted_dir(root: Path | str = Path(".")) -> Path: + return home(root) / "converted" + + +def graph_path(root: Path | str = Path(".")) -> Path: + return home(root) / "graph.json" + + +def report_path(root: Path | str = Path(".")) -> Path: + return home(root) / "GRAPH_REPORT.md" + + +def cost_path(root: Path | str = Path(".")) -> Path: + return home(root) / "cost.json" + + +def needs_update_path(root: Path | str = Path(".")) -> Path: + return home(root) / "needs_update" + + +def has_legacy_layout(root: Path | str = Path(".")) -> bool: + """True iff GRAPHIFY_HOME unset, ``graphify-out/`` exists, 
and the default home does not.""" + if os.environ.get(ENV_HOME, "").strip(): + return False + r = Path(root).resolve() + return (r / LEGACY_HOME_NAME).is_dir() and not (r / DEFAULT_HOME_NAME).exists() + + +def auto_migrate(root: Path | str = Path(".")) -> bool: + """Rename ``graphify-out/`` to the configured home on legacy layouts. Returns True if migrated. + + Conservative: never overwrites or merges. Use ``graphify migrate-home --force`` + to resolve a side-by-side layout manually. + """ + if not has_legacy_layout(root): + return False + r = Path(root).resolve() + (r / LEGACY_HOME_NAME).rename(r / home_name()) + return True diff --git a/graphify/security.py b/graphify/security.py index 0d906013..e7baebab 100644 --- a/graphify/security.py +++ b/graphify/security.py @@ -144,7 +144,9 @@ def safe_fetch_text(url: str, max_bytes: int = _MAX_TEXT_BYTES, timeout: int = 1 def validate_graph_path(path: str | Path, base: Path | None = None) -> Path: """Resolve *path* and verify it stays inside *base*. - *base* defaults to the `graphify-out` directory relative to CWD. + *base* defaults to the configured graphify home directory (see + :mod:`graphify.paths`) under CWD. + Also requires the base directory to exist, so a caller cannot trick graphify into reading files before any graph has been built. @@ -152,15 +154,10 @@ def validate_graph_path(path: str | Path, base: Path | None = None) -> Path: ValueError - path escapes base, or base does not exist FileNotFoundError - resolved path does not exist """ - if base is None: - resolved_hint = Path(path).resolve() - for candidate in [resolved_hint, *resolved_hint.parents]: - if candidate.name == "graphify-out": - base = candidate - break - if base is None: - base = Path("graphify-out").resolve() + from . 
import paths as _paths + if base is None: + base = _paths.home() base = base.resolve() if not base.exists(): raise ValueError( @@ -174,7 +171,7 @@ def validate_graph_path(path: str | Path, base: Path | None = None) -> Path: except ValueError: raise ValueError( f"Path {path!r} escapes the allowed directory {base}. " - "Only paths inside graphify-out/ are permitted." + f"Only paths inside {base.name}/ are permitted." ) if not resolved.exists(): diff --git a/graphify/serve.py b/graphify/serve.py index 361dec3c..b091193d 100644 --- a/graphify/serve.py +++ b/graphify/serve.py @@ -147,8 +147,14 @@ def _relay() -> None: sys.stdin = open(0, "r", closefd=False) -def serve(graph_path: str = "graphify-out/graph.json") -> None: - """Start the MCP server. Requires pip install mcp.""" +def serve(graph_path: str | Path | None = None) -> None: + """Start the MCP server. Requires pip install mcp. + + *graph_path* defaults to ``$GRAPHIFY_HOME/graph.json``. + """ + if graph_path is None: + from . import paths as _paths + graph_path = _paths.graph_path() try: from mcp.server import Server from mcp.server.stdio import stdio_server @@ -369,5 +375,7 @@ async def main() -> None: if __name__ == "__main__": - graph_path = sys.argv[1] if len(sys.argv) > 1 else "graphify-out/graph.json" - serve(graph_path) + if len(sys.argv) > 1: + serve(sys.argv[1]) + else: + serve() diff --git a/graphify/transcribe.py b/graphify/transcribe.py index 70000757..3de186e1 100644 --- a/graphify/transcribe.py +++ b/graphify/transcribe.py @@ -5,12 +5,13 @@ import os from pathlib import Path +from graphify import paths as _paths + VIDEO_EXTENSIONS = {'.mp4', '.mov', '.webm', '.mkv', '.avi', '.m4v', '.mp3', '.wav', '.m4a', '.ogg'} URL_PREFIXES = ('http://', 'https://', 'www.') _DEFAULT_MODEL = "base" -_TRANSCRIPTS_DIR = "graphify-out/transcripts" _FALLBACK_PROMPT = "Use proper punctuation and paragraph breaks." @@ -126,7 +127,7 @@ def transcribe( initial_prompt: domain hint for Whisper (built from corpus god nodes). 
force: re-transcribe even if transcript already exists. """ - out_dir = Path(output_dir) if output_dir else Path(_TRANSCRIPTS_DIR) + out_dir = Path(output_dir) if output_dir else _paths.home() / "transcripts" out_dir.mkdir(parents=True, exist_ok=True) if is_url(str(video_path)): diff --git a/graphify/watch.py b/graphify/watch.py index a09dd51e..45b1087b 100644 --- a/graphify/watch.py +++ b/graphify/watch.py @@ -5,7 +5,7 @@ import time from pathlib import Path - +from graphify import paths as _paths from graphify.detect import CODE_EXTENSIONS, DOC_EXTENSIONS, PAPER_EXTENSIONS, IMAGE_EXTENSIONS _WATCHED_EXTENSIONS = CODE_EXTENSIONS | DOC_EXTENSIONS | PAPER_EXTENSIONS | IMAGE_EXTENSIONS @@ -61,7 +61,7 @@ def _rebuild_code(watch_path: Path, *, follow_symlinks: bool = False) -> bool: # Preserve semantic nodes/edges from a previous full run. # AST-only rebuild replaces code nodes; doc/paper/image nodes are kept. - out = watch_path / "graphify-out" + out = _paths.home(watch_path) existing_graph = out / "graph.json" if existing_graph.exists(): try: @@ -140,7 +140,7 @@ def check_update(watch_path: Path) -> bool: re-extraction via `/graphify --update` — this function only signals that the update is needed. 
""" - flag = Path(watch_path) / "graphify-out" / "needs_update" + flag = _paths.needs_update_path(Path(watch_path)) if flag.exists(): print(f"[graphify check-update] Pending non-code changes in {watch_path}.") print("[graphify check-update] Run `/graphify --update` to apply semantic re-extraction.") @@ -149,7 +149,7 @@ def check_update(watch_path: Path) -> bool: def _notify_only(watch_path: Path) -> None: """Write a flag file and print a notification (fallback for non-code-only corpora).""" - flag = watch_path / "graphify-out" / "needs_update" + flag = _paths.needs_update_path(watch_path) flag.parent.mkdir(parents=True, exist_ok=True) flag.write_text("1", encoding="utf-8") print(f"\n[graphify watch] New or changed files detected in {watch_path}") @@ -194,7 +194,7 @@ def on_any_event(self, event): return if any(part.startswith(".") for part in path.parts): return - if "graphify-out" in path.parts: + if _paths.home_name() in path.parts: return last_trigger = time.monotonic() pending = True diff --git a/pyproject.toml b/pyproject.toml index 058f6de5..6fbc14a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ keywords = ["claude", "claude-code", "codex", "opencode", "cursor", "gemini", "a requires-python = ">=3.10,<3.14" dependencies = [ "networkx", + "pathspec>=0.12", "tree-sitter>=0.23.0", "tree-sitter-python", "tree-sitter-javascript", diff --git a/tests/test_cache.py b/tests/test_cache.py index fd57cad1..4fd5dcc6 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -67,11 +67,12 @@ def test_cached_files(tmp_path, cache_root): def test_clear_cache(tmp_file, cache_root): - """clear_cache removes all .json files from graphify-out/cache/.""" + """clear_cache removes all .json files from the resolved cache dir.""" + cd = cache_dir(cache_root) save_cached(tmp_file, {"nodes": [], "edges": []}, root=cache_root) - assert len(list((cache_root / "graphify-out" / "cache").glob("*.json"))) > 0 + assert len(list(cd.glob("*.json"))) > 0 
clear_cache(cache_root) - assert len(list((cache_root / "graphify-out" / "cache").glob("*.json"))) == 0 + assert len(list(cd.glob("*.json"))) == 0 def test_md_frontmatter_only_change_same_hash(tmp_path): diff --git a/tests/test_gitignore.py b/tests/test_gitignore.py new file mode 100644 index 00000000..ee0e69e5 --- /dev/null +++ b/tests/test_gitignore.py @@ -0,0 +1,495 @@ +"""Tests for .gitignore-aware detection in detect.py. + +The implementation uses :mod:`pathspec` with the gitwildmatch flavor, so the +full git ignore syntax is supported: trailing-slash dir-only, leading-slash +anchored, ``**`` recursive globs, and ``!``-negation that re-includes +previously-ignored entries (last match wins, parent-first). + +Note on git's documented limitation: a ``!`` rule cannot rescue a file from +an already-pruned parent directory. We follow git here — once a directory is +ignored, paths underneath cannot be re-included. +""" +from __future__ import annotations + +from pathlib import Path + +import pytest + +from graphify.detect import detect, _is_ignored, _load_gitignore, _load_graphifyignore + + +@pytest.fixture(autouse=True) +def _isolate_env(monkeypatch): + monkeypatch.delenv("GRAPHIFY_RESPECT_GITIGNORE", raising=False) + yield + + +def _write(p: Path, content: str = "x") -> None: + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content, encoding="utf-8") + + +def _project(tmp_path: Path) -> Path: + """Make tmp_path look like a git repo so the loader anchors to it.""" + (tmp_path / ".git").mkdir() + return tmp_path.resolve() + + +def _detected(root: Path) -> list[str]: + """Return all detected file paths as posix strings relative to *root*.""" + result = detect(root) + files = [f for files in result["files"].values() for f in files] + return sorted( + Path(f).resolve().relative_to(root.resolve()).as_posix() + for f in files + ) + + +# --------------------------------------------------------------------------- +# Loader-level tests +# 
--------------------------------------------------------------------------- + + +def test_load_gitignore_returns_anchored_specs(tmp_path): + """Loader returns (anchor_dir, PathSpec) pairs.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("dist/\n*.tmp\n# comment\n\n", encoding="utf-8") + specs = _load_gitignore(root) + assert len(specs) == 1 + anchor, spec = specs[0] + assert anchor == root.resolve() + # gitwildmatch keeps a Pattern object even for comments / blank lines, but + # those have regex=None — count only real, matchable patterns. + real = [p for p in spec.patterns if p.regex is not None] + assert len(real) == 2 + + +def test_no_gitignore_returns_empty(tmp_path): + root = _project(tmp_path) + assert _load_gitignore(root) == [] + + +def test_env_opt_out(tmp_path, monkeypatch): + """GRAPHIFY_RESPECT_GITIGNORE=0 disables gitignore loading entirely.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("dist/\n", encoding="utf-8") + for falsy in ("0", "false", "no", "off"): + monkeypatch.setenv("GRAPHIFY_RESPECT_GITIGNORE", falsy) + assert _load_gitignore(root) == [] + monkeypatch.setenv("GRAPHIFY_RESPECT_GITIGNORE", "1") + assert _load_gitignore(root) != [] + + +def test_loader_walks_to_repo_root(tmp_path): + """A .gitignore at the repo root is found even when scanning a subfolder.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("ignored/\n", encoding="utf-8") + sub = root / "src" + sub.mkdir() + specs = _load_gitignore(sub) + anchors = [a for a, _ in specs] + assert root.resolve() in anchors + + +def test_nested_gitignore_loaded(tmp_path): + """Nested .gitignore files inside the scan root are picked up too.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("dist/\n", encoding="utf-8") + nested = root / "pkg" / ".gitignore" + _write(nested, "*.cache\n") + specs = _load_gitignore(root) + anchors = {a for a, _ in specs} + assert root.resolve() in anchors + assert nested.parent.resolve() in anchors + 
+ +# --------------------------------------------------------------------------- +# Full wildmatch syntax — these are the cases that fnmatch couldn't honor. +# --------------------------------------------------------------------------- + + +def test_negation_re_includes_file_at_same_level(tmp_path): + """`!keep.py` re-includes a file that an earlier pattern would exclude. + + File-level negation works as long as the parent directory is not itself + excluded (see :func:`test_negation_does_not_resurrect_pruned_directory`). + """ + root = _project(tmp_path) + (root / ".gitignore").write_text("*.py\n!keep.py\n", encoding="utf-8") + _write(root / "noise.py") + _write(root / "keep.py") + _write(root / "doc.md") + + rel = _detected(root) + assert "keep.py" in rel + assert "doc.md" in rel + assert "noise.py" not in rel + + +def test_double_star_glob(tmp_path): + """`**/foo/` matches the directory at any depth.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("**/__generated__/\n", encoding="utf-8") + _write(root / "a" / "__generated__" / "x.py") + _write(root / "a" / "b" / "__generated__" / "y.py") + _write(root / "a" / "b" / "real.py") + + rel = _detected(root) + assert "a/b/real.py" in rel + assert not any("__generated__" in f for f in rel) + + +def test_leading_slash_is_anchored(tmp_path): + """`/build/` is anchored to the .gitignore's directory only.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("/build/\n", encoding="utf-8") + _write(root / "build" / "out.py") + # A nested directory named "build" must NOT be excluded. 
+ _write(root / "src" / "build" / "kept.py") + + rel = _detected(root) + assert "src/build/kept.py" in rel + assert not any(f.startswith("build/") for f in rel) + + +def test_trailing_slash_means_directory_only(tmp_path): + """`foo/` matches a directory called foo, not a file called foo.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("logs/\n", encoding="utf-8") + _write(root / "logs" / "app.py") + # A file literally named "logs" must not be excluded. + (root / "logs.md").write_text("# notes", encoding="utf-8") + + rel = _detected(root) + assert "logs.md" in rel + assert not any(f.startswith("logs/") for f in rel) + + +def test_nested_gitignore_overrides_parent(tmp_path): + """A child .gitignore's negation re-includes a file the parent ignored. + + Both rules are file-level (`*.py`, `!important.py`), neither prunes a + parent directory, so the negation actually fires. + """ + root = _project(tmp_path) + (root / ".gitignore").write_text("*.py\n", encoding="utf-8") + sub = root / "deploy" + sub.mkdir() + (sub / ".gitignore").write_text("!important.py\n", encoding="utf-8") + + _write(root / "noise.py") + _write(sub / "important.py") + _write(sub / "trace.py") + + rel = _detected(root) + assert "deploy/important.py" in rel + assert "noise.py" not in rel + assert "deploy/trace.py" not in rel + + +def test_negation_does_not_resurrect_pruned_directory(tmp_path): + """Documented git limitation: !file inside an ignored dir cannot recover it. + + git's own docs spell this out — once the directory is excluded, paths + inside it cannot be re-included. We honor the same rule because we + prune ignored directories at os.walk time for performance. 
+ """ + root = _project(tmp_path) + (root / ".gitignore").write_text("excluded/\n!excluded/keep.py\n", encoding="utf-8") + _write(root / "excluded" / "keep.py") + _write(root / "excluded" / "junk.py") + _write(root / "src" / "main.py") + + rel = _detected(root) + assert "src/main.py" in rel + # excluded/ was pruned wholesale; this matches git's own behavior. + assert not any(f.startswith("excluded/") for f in rel) + + +def test_user_negation_overrides_default_dotfile_skip(tmp_path): + """A user `!.config/` rule re-includes a directory we'd otherwise skip.""" + # By default we prune dotfile directories. An explicit !-rule re-includes. + root = _project(tmp_path) + (root / ".gitignore").write_text("!.config/\n", encoding="utf-8") + _write(root / ".config" / "settings.py") + _write(root / "main.py") + + rel = _detected(root) + assert "main.py" in rel + assert ".config/settings.py" in rel + + +# --------------------------------------------------------------------------- +# Integration with detect() +# --------------------------------------------------------------------------- + + +def test_detect_excludes_gitignored_files(tmp_path): + root = _project(tmp_path) + (root / ".gitignore").write_text("ignored/\n*.secret\n", encoding="utf-8") + + _write(root / "kept.py", "print('hi')") + _write(root / "kept.md", "# Kept") + _write(root / "ignored" / "junk.py", "x = 1") + _write(root / "leak.secret", "topsecret") + + rel = _detected(root) + assert "kept.py" in rel + assert "kept.md" in rel + assert not any(f.startswith("ignored/") for f in rel) + assert "leak.secret" not in rel + + +def test_detect_respects_opt_out(tmp_path, monkeypatch): + root = _project(tmp_path) + (root / ".gitignore").write_text("ignored/\n", encoding="utf-8") + _write(root / "kept.py") + _write(root / "ignored" / "junk.py") + + monkeypatch.setenv("GRAPHIFY_RESPECT_GITIGNORE", "0") + rel = _detected(root) + assert "kept.py" in rel + assert any(f.startswith("ignored/") for f in rel) + + +def 
test_detect_combines_gitignore_and_graphifyignore(tmp_path): + root = _project(tmp_path) + (root / ".gitignore").write_text("from-git/\n", encoding="utf-8") + (root / ".graphifyignore").write_text("from-graphify/\n", encoding="utf-8") + + _write(root / "kept.py") + _write(root / "from-git" / "a.py") + _write(root / "from-graphify" / "b.py") + + rel = _detected(root) + assert "kept.py" in rel + assert not any(f.startswith("from-git/") for f in rel) + assert not any(f.startswith("from-graphify/") for f in rel) + + +def test_graphifyignore_supports_full_syntax(tmp_path): + """The new pathspec backend lifts .graphifyignore to full gitwildmatch too. + + Uses file-level patterns (no dir-level pruning) so the negation actually + fires within the same directory. + """ + root = _project(tmp_path) + (root / ".graphifyignore").write_text( + "*.py\n" + "!keep.py\n" + "**/__cache__/\n", + encoding="utf-8", + ) + _write(root / "noise.py") + _write(root / "keep.py") + _write(root / "src" / "__cache__" / "x.py") + _write(root / "src" / "kept.md") + + rel = _detected(root) + assert "keep.py" in rel + assert "src/kept.md" in rel + assert "noise.py" not in rel + assert not any("__cache__" in f for f in rel) + + +# --------------------------------------------------------------------------- +# Direct _is_ignored unit checks (without going through detect()) +# --------------------------------------------------------------------------- + + +def test_is_ignored_returns_false_for_no_specs(tmp_path): + """Empty spec list means no opinion — file is kept.""" + assert _is_ignored(tmp_path / "foo.py", []) is False + + +def test_is_ignored_outside_anchor_subtree(tmp_path): + """A spec under subdir/ must not affect paths outside subdir/.""" + root = _project(tmp_path) + (root / "subdir").mkdir() + (root / "subdir" / ".gitignore").write_text("*.bin\n", encoding="utf-8") + specs = _load_gitignore(root) + + inside = root / "subdir" / "blob.bin" + outside = root / "blob.bin" + _write(inside) + 
_write(outside) + + assert _is_ignored(inside, specs) is True + # Outside the anchor subtree no spec applies — file is kept. + assert _is_ignored(outside, specs) is False + + +def test_is_ignored_dir_flag(tmp_path): + """Passing is_dir=True lets dir-only patterns (`build/`) match the bare path.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("build/\n", encoding="utf-8") + specs = _load_gitignore(root) + build = root / "build" + build.mkdir() + + # Without is_dir the trailing-slash pattern can't match the bare name. + assert _is_ignored(build, specs, is_dir=False) is False + # With is_dir the helper appends the slash so the pattern matches. + assert _is_ignored(build, specs, is_dir=True) is True + + +def test_is_ignored_returns_false_on_negation(tmp_path): + """A `!`-rule that fires last keeps the file (returns False).""" + root = _project(tmp_path) + (root / ".gitignore").write_text("*.py\n!keep.py\n", encoding="utf-8") + specs = _load_gitignore(root) + f = root / "keep.py" + _write(f) + assert _is_ignored(f, specs) is False + + +# --------------------------------------------------------------------------- +# Unified .gitignore + .graphifyignore handling. +# +# A .graphifyignore is just a renamed .gitignore for graphify-specific rules. +# Both files are loaded by the same discovery walk and feed the same matcher; +# the only ordering rule is that within a single directory, .gitignore is +# evaluated first and .graphifyignore second, so graphify-specific rules can +# override their co-located git rule via last-match-wins. +# --------------------------------------------------------------------------- + + +def test_graphifyignore_can_override_gitignore_at_same_anchor(tmp_path): + """A `!` in .graphifyignore re-includes a path .gitignore tried to exclude. + + Both files live in the same directory; `.graphifyignore` is evaluated + second, so its negation wins. 
+ """ + root = _project(tmp_path) + (root / ".gitignore").write_text("*.py\n", encoding="utf-8") + (root / ".graphifyignore").write_text("!analytics.py\n", encoding="utf-8") + _write(root / "noise.py") + _write(root / "analytics.py") + + rel = _detected(root) + assert "analytics.py" in rel + assert "noise.py" not in rel + + +def test_graphifyignore_supports_nested_files(tmp_path): + """Nested .graphifyignore files are picked up just like nested .gitignore.""" + root = _project(tmp_path) + sub = root / "pkg" + sub.mkdir() + (sub / ".graphifyignore").write_text("*.py\n!keep.py\n", encoding="utf-8") + _write(root / "top.py") + _write(sub / "noise.py") + _write(sub / "keep.py") + + rel = _detected(root) + assert "top.py" in rel + assert "pkg/keep.py" in rel + assert "pkg/noise.py" not in rel + + +def test_gitignore_opt_out_does_not_disable_graphifyignore(tmp_path, monkeypatch): + """Setting GRAPHIFY_RESPECT_GITIGNORE=0 only silences .gitignore.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("kept.py\n", encoding="utf-8") + (root / ".graphifyignore").write_text("dropped.py\n", encoding="utf-8") + _write(root / "kept.py") + _write(root / "dropped.py") + _write(root / "main.py") + + monkeypatch.setenv("GRAPHIFY_RESPECT_GITIGNORE", "0") + rel = _detected(root) + # .gitignore is silenced -> kept.py comes back. + assert "kept.py" in rel + # .graphifyignore is unaffected -> dropped.py stays excluded. + assert "dropped.py" not in rel + assert "main.py" in rel + + +# --------------------------------------------------------------------------- +# Built-in noise spec — the noise-prune rules are now expressed as a +# GitIgnoreSpec prepended to the user's chain. This means user `!`-rules +# naturally override built-in noise via last-match-wins, and the same +# matcher handles every kind of pruning (no parallel _SKIP_DIRS list). 
+# --------------------------------------------------------------------------- + + +def test_noise_spec_prunes_dotfiles_and_dotdirs(tmp_path): + """Dotfiles + dotdirs anywhere are pruned by the built-in noise spec.""" + root = _project(tmp_path) + _write(root / ".eslintrc") + _write(root / "src" / ".gitkeep") + _write(root / "src" / "main.py") + _write(root / ".pytest_cache" / "v" / "cache" / "data.txt") + rel = _detected(root) + assert ".eslintrc" not in rel + assert "src/.gitkeep" not in rel + assert "src/main.py" in rel + assert all(not f.startswith(".pytest_cache/") for f in rel) + + +def test_noise_spec_prunes_lockfiles_anywhere(tmp_path): + """Lockfiles like package-lock.json match anywhere via basename.""" + root = _project(tmp_path) + _write(root / "package-lock.json") + _write(root / "Cargo.lock") + _write(root / "subdir" / "yarn.lock") + _write(root / "main.py") + rel = _detected(root) + assert "package-lock.json" not in rel + assert "Cargo.lock" not in rel + assert "subdir/yarn.lock" not in rel + assert "main.py" in rel + + +def test_noise_spec_prunes_venv_suffix_dirs(tmp_path): + """*_venv and *_env wildcard dir patterns prune custom-named virtualenvs.""" + root = _project(tmp_path) + _write(root / "myproj_venv" / "lib" / "site.py") + _write(root / "tools_env" / "bin" / "tool.py") + _write(root / "venv" / "lib" / "x.py") + _write(root / "node_modules" / "pkg" / "index.js") + _write(root / "src" / "main.py") + rel = _detected(root) + assert all(not f.startswith("myproj_venv/") for f in rel) + assert all(not f.startswith("tools_env/") for f in rel) + assert all(not f.startswith("venv/") for f in rel) + assert all(not f.startswith("node_modules/") for f in rel) + assert "src/main.py" in rel + + +def test_noise_spec_prunes_egg_info(tmp_path): + """*.egg-info/ wildcard dir pattern prunes packaging metadata dirs.""" + root = _project(tmp_path) + _write(root / "mypackage.egg-info" / "PKG-INFO") + _write(root / "src" / "main.py") + rel = _detected(root) + 
assert all(not f.startswith("mypackage.egg-info/") for f in rel) + assert "src/main.py" in rel + + +def test_noise_spec_prunes_graphify_home(tmp_path): + """The configured graphify home dir is treated as noise.""" + root = _project(tmp_path) + _write(root / ".graphify" / "graph.json") + _write(root / "src" / "main.py") + rel = _detected(root) + assert all(not f.startswith(".graphify/") for f in rel) + assert "src/main.py" in rel + + +def test_user_negation_overrides_builtin_noise_dotdir(tmp_path): + """A user `!`-rule on a dotdir rescues it from the built-in noise spec.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("!.config/\n", encoding="utf-8") + _write(root / ".config" / "settings.py") + rel = _detected(root) + assert ".config/settings.py" in rel + + +def test_user_negation_overrides_builtin_noise_venv(tmp_path): + """A user `!`-rule rescues a recognized file inside a built-in-noise dir.""" + root = _project(tmp_path) + (root / ".gitignore").write_text("!venv/\n", encoding="utf-8") + _write(root / "venv" / "kept.py") + rel = _detected(root) + assert "venv/kept.py" in rel diff --git a/tests/test_install.py b/tests/test_install.py index e94e3086..eb72e5c2 100644 --- a/tests/test_install.py +++ b/tests/test_install.py @@ -70,21 +70,21 @@ def test_install_unknown_platform_exits(tmp_path): def test_codex_skill_contains_spawn_agent(): """Codex skill file must reference spawn_agent.""" import graphify - skill = (Path(graphify.__file__).parent / "skill-codex.md").read_text() + skill = (Path(graphify.__file__).parent / "skill-codex.md").read_text(encoding="utf-8") assert "spawn_agent" in skill def test_opencode_skill_contains_mention(): """OpenCode skill file must reference @mention.""" import graphify - skill = (Path(graphify.__file__).parent / "skill-opencode.md").read_text() + skill = (Path(graphify.__file__).parent / "skill-opencode.md").read_text(encoding="utf-8") assert "@mention" in skill def test_claw_skill_is_sequential(): """OpenClaw skill 
file must describe sequential extraction.""" import graphify - skill = (Path(graphify.__file__).parent / "skill-claw.md").read_text() + skill = (Path(graphify.__file__).parent / "skill-claw.md").read_text(encoding="utf-8") assert "sequential" in skill.lower() assert "spawn_agent" not in skill assert "@mention" not in skill @@ -238,7 +238,8 @@ def test_cursor_install_writes_rule(tmp_path): assert rule.exists() content = rule.read_text() assert "alwaysApply: true" in content - assert "graphify-out/GRAPH_REPORT.md" in content + from graphify import paths as _paths + assert f"{_paths.home_name()}/GRAPH_REPORT.md" in content def test_cursor_install_idempotent(tmp_path): @@ -266,6 +267,28 @@ def test_cursor_uninstall_noop_if_not_installed(tmp_path): _cursor_uninstall(tmp_path) # should not raise +def test_install_auto_migrates_legacy_layout(tmp_path): + """Running an install command on a project with legacy graphify-out/ + layout auto-renames it to the configured home dir, so the CLAUDE.md / + skill / hook rewrites that the install does match the on-disk dir. 
+ """ + (tmp_path / "graphify-out").mkdir() + (tmp_path / "graphify-out" / "graph.json").write_text("{}", encoding="utf-8") + from graphify.__main__ import _cursor_install + _cursor_install(tmp_path) + assert not (tmp_path / "graphify-out").exists() + assert (tmp_path / ".graphify" / "graph.json").is_file() + + +def test_install_does_not_migrate_when_no_legacy(tmp_path): + """Fresh installs (no graphify-out/) leave the filesystem untouched + apart from what the install itself writes.""" + from graphify.__main__ import _cursor_install + _cursor_install(tmp_path) + assert not (tmp_path / "graphify-out").exists() + assert not (tmp_path / ".graphify").exists() + + # ── Gemini CLI ──────────────────────────────────────────────────────────────── def test_gemini_install_writes_gemini_md(tmp_path): @@ -273,7 +296,8 @@ def test_gemini_install_writes_gemini_md(tmp_path): gemini_install(tmp_path) md = tmp_path / "GEMINI.md" assert md.exists() - assert "graphify-out/GRAPH_REPORT.md" in md.read_text() + from graphify import paths as _paths + assert f"{_paths.home_name()}/GRAPH_REPORT.md" in md.read_text() def test_gemini_install_writes_hook(tmp_path): import json as _json @@ -296,7 +320,8 @@ def test_gemini_install_merges_existing_gemini_md(tmp_path): gemini_install(tmp_path) content = (tmp_path / "GEMINI.md").read_text() assert "# My project rules" in content - assert "graphify-out/GRAPH_REPORT.md" in content + from graphify import paths as _paths + assert f"{_paths.home_name()}/GRAPH_REPORT.md" in content def test_gemini_uninstall_removes_section(tmp_path): from graphify.__main__ import gemini_install, gemini_uninstall diff --git a/tests/test_migrate.py b/tests/test_migrate.py new file mode 100644 index 00000000..39170731 --- /dev/null +++ b/tests/test_migrate.py @@ -0,0 +1,112 @@ +"""Tests for the `graphify migrate-home` CLI command.""" +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path + +import pytest + +GRAPHIFY_ROOT 
= Path(__file__).resolve().parent.parent + + +def _run_cli(args: list[str], cwd: Path, env_extra: dict | None = None) -> subprocess.CompletedProcess: + """Invoke `python -m graphify ` with PYTHONPATH pointing at the repo.""" + env = os.environ.copy() + env["PYTHONPATH"] = str(GRAPHIFY_ROOT) + env.pop("GRAPHIFY_HOME", None) + if env_extra: + env.update(env_extra) + return subprocess.run( + [sys.executable, "-m", "graphify", *args], + cwd=cwd, + env=env, + capture_output=True, + text=True, + encoding="utf-8", + ) + + +def test_migrate_no_legacy_dir_is_noop(tmp_path): + """Running migrate when there's no graphify-out/ prints a friendly message.""" + result = _run_cli(["migrate-home"], cwd=tmp_path) + assert result.returncode == 0 + assert "nothing to migrate" in result.stdout.lower() + + +def test_migrate_renames_legacy_to_default(tmp_path): + """graphify-out/ is renamed to .graphify/ when the new dir doesn't exist yet.""" + legacy = tmp_path / "graphify-out" + legacy.mkdir() + (legacy / "graph.json").write_text("{}", encoding="utf-8") + (legacy / "cache").mkdir() + (legacy / "cache" / "abc.json").write_text("{}", encoding="utf-8") + + result = _run_cli(["migrate-home"], cwd=tmp_path) + assert result.returncode == 0, result.stderr + assert not legacy.exists() + new_home = tmp_path / ".graphify" + assert (new_home / "graph.json").is_file() + assert (new_home / "cache" / "abc.json").is_file() + + +def test_migrate_dry_run_does_not_move(tmp_path): + """--dry-run announces the rename but leaves the filesystem untouched.""" + legacy = tmp_path / "graphify-out" + legacy.mkdir() + (legacy / "graph.json").write_text("{}", encoding="utf-8") + + result = _run_cli(["migrate-home", "--dry-run"], cwd=tmp_path) + assert result.returncode == 0, result.stderr + assert "[dry-run]" in result.stdout + assert legacy.exists() + assert not (tmp_path / ".graphify").exists() + + +def test_migrate_refuses_when_target_exists(tmp_path): + """Without --force we refuse to clobber an existing 
.graphify/.""" + (tmp_path / "graphify-out").mkdir() + (tmp_path / ".graphify").mkdir() + + result = _run_cli(["migrate-home"], cwd=tmp_path) + assert result.returncode == 2 + assert "refusing to overwrite" in result.stderr.lower() + + +def test_migrate_force_merges(tmp_path): + """--force merges legacy into target without overwriting target files.""" + legacy = tmp_path / "graphify-out" + new = tmp_path / ".graphify" + legacy.mkdir() + new.mkdir() + (legacy / "old-only.txt").write_text("old", encoding="utf-8") + (legacy / "shared.txt").write_text("legacy version", encoding="utf-8") + (new / "shared.txt").write_text("new version", encoding="utf-8") + (new / "new-only.txt").write_text("new", encoding="utf-8") + + result = _run_cli(["migrate-home", "--force"], cwd=tmp_path) + assert result.returncode == 0, result.stderr + assert not legacy.exists() + assert (new / "old-only.txt").read_text(encoding="utf-8") == "old" + assert (new / "new-only.txt").read_text(encoding="utf-8") == "new" + # Conflict resolution: target wins. + assert (new / "shared.txt").read_text(encoding="utf-8") == "new version" + + +def test_migrate_respects_env_target(tmp_path): + """If GRAPHIFY_HOME points elsewhere, migrate moves into that name.""" + legacy = tmp_path / "graphify-out" + legacy.mkdir() + (legacy / "graph.json").write_text("{}", encoding="utf-8") + + result = _run_cli( + ["migrate-home"], + cwd=tmp_path, + env_extra={"GRAPHIFY_HOME": "build-graph"}, + ) + assert result.returncode == 0, result.stderr + assert not legacy.exists() + assert (tmp_path / "build-graph" / "graph.json").is_file() + # Default name should NOT be created when env is set. 
+ assert not (tmp_path / ".graphify").exists() diff --git a/tests/test_paths.py b/tests/test_paths.py new file mode 100644 index 00000000..6607d36d --- /dev/null +++ b/tests/test_paths.py @@ -0,0 +1,148 @@ +"""Tests for graphify/paths.py — configurable home directory.""" +from __future__ import annotations + +import os +from pathlib import Path + +import pytest + +from graphify import paths + + +@pytest.fixture(autouse=True) +def _isolate_env(monkeypatch): + """Ensure GRAPHIFY_HOME doesn't leak between tests.""" + monkeypatch.delenv(paths.ENV_HOME, raising=False) + yield + + +def test_default_home_name_is_dotgraphify(): + """Fresh-install default is .graphify (no env var, no legacy dir).""" + assert paths.home_name() == ".graphify" + assert paths.DEFAULT_HOME_NAME == ".graphify" + + +def test_legacy_home_name_constant(): + """LEGACY_HOME_NAME exposes the old default for migration tooling.""" + assert paths.LEGACY_HOME_NAME == "graphify-out" + + +def test_env_override(monkeypatch): + """GRAPHIFY_HOME env var fully overrides the default name.""" + monkeypatch.setenv(paths.ENV_HOME, "graphify-out") + assert paths.home_name() == "graphify-out" + + monkeypatch.setenv(paths.ENV_HOME, ".my-graph") + assert paths.home_name() == ".my-graph" + + +def test_blank_env_falls_back_to_default(monkeypatch): + """Empty/whitespace-only env values are ignored (treated as unset).""" + monkeypatch.setenv(paths.ENV_HOME, "") + assert paths.home_name() == ".graphify" + + monkeypatch.setenv(paths.ENV_HOME, " ") + assert paths.home_name() == ".graphify" + + +def test_home_resolves_under_root(tmp_path): + """home(root) returns an absolute path under the given root.""" + h = paths.home(tmp_path) + assert h == (tmp_path / ".graphify").resolve() + + +def test_cache_dir_creates_and_resolves(tmp_path): + """cache_dir() resolves to /cache and creates it by default.""" + cd = paths.cache_dir(tmp_path) + assert cd == (tmp_path / ".graphify" / "cache").resolve() + assert cd.is_dir() + + +def 
test_cache_dir_no_create(tmp_path): + """cache_dir(create=False) does NOT create the directory.""" + cd = paths.cache_dir(tmp_path, create=False) + assert not cd.exists() + + +def test_subpath_helpers(tmp_path): + """All sub-path helpers compose against the resolved home dir.""" + base = (tmp_path / ".graphify").resolve() + assert paths.manifest_path(tmp_path) == base / "manifest.json" + assert paths.memory_dir(tmp_path) == base / "memory" + assert paths.converted_dir(tmp_path) == base / "converted" + assert paths.graph_path(tmp_path) == base / "graph.json" + assert paths.report_path(tmp_path) == base / "GRAPH_REPORT.md" + assert paths.cost_path(tmp_path) == base / "cost.json" + assert paths.needs_update_path(tmp_path) == base / "needs_update" + + +def test_subpath_helpers_follow_env(tmp_path, monkeypatch): + """Changing GRAPHIFY_HOME at runtime changes every helper.""" + monkeypatch.setenv(paths.ENV_HOME, "build-graph") + base = (tmp_path / "build-graph").resolve() + assert paths.home(tmp_path) == base + assert paths.cache_dir(tmp_path).parent == base + assert paths.graph_path(tmp_path) == base / "graph.json" + + +def test_has_legacy_layout_true(tmp_path): + """has_legacy_layout returns True when graphify-out exists and .graphify doesn't.""" + (tmp_path / "graphify-out").mkdir() + assert paths.has_legacy_layout(tmp_path) is True + + +def test_has_legacy_layout_false_when_both_exist(tmp_path): + """If both legacy and current dirs exist, no migration needed (or pending).""" + (tmp_path / "graphify-out").mkdir() + (tmp_path / ".graphify").mkdir() + assert paths.has_legacy_layout(tmp_path) is False + + +def test_has_legacy_layout_false_when_no_legacy(tmp_path): + """No legacy dir → nothing to migrate.""" + assert paths.has_legacy_layout(tmp_path) is False + + +def test_has_legacy_layout_false_when_env_set(tmp_path, monkeypatch): + """If user pinned GRAPHIFY_HOME explicitly, leave them alone.""" + (tmp_path / "graphify-out").mkdir() + monkeypatch.setenv(paths.ENV_HOME, 
"graphify-out") + assert paths.has_legacy_layout(tmp_path) is False + + +# --------------------------------------------------------------------------- +# auto_migrate — invoked at the start of every install entry point so an +# upgrading user's directory rename happens in lockstep with the CLAUDE.md / +# hook / skill-file rewrites those install commands trigger. +# --------------------------------------------------------------------------- + + +def test_auto_migrate_renames_legacy_dir(tmp_path): + """Bare graphify-out/ next to no .graphify/ → renamed to .graphify/.""" + (tmp_path / "graphify-out").mkdir() + (tmp_path / "graphify-out" / "graph.json").write_text("{}", encoding="utf-8") + assert paths.auto_migrate(tmp_path) is True + assert not (tmp_path / "graphify-out").exists() + assert (tmp_path / ".graphify" / "graph.json").is_file() + + +def test_auto_migrate_noop_no_legacy(tmp_path): + """No legacy dir → nothing to do.""" + assert paths.auto_migrate(tmp_path) is False + + +def test_auto_migrate_noop_when_target_already_exists(tmp_path): + """Both layouts present → conservative refusal; user runs migrate-home --force.""" + (tmp_path / "graphify-out").mkdir() + (tmp_path / ".graphify").mkdir() + assert paths.auto_migrate(tmp_path) is False + assert (tmp_path / "graphify-out").exists() + assert (tmp_path / ".graphify").exists() + + +def test_auto_migrate_noop_when_env_set(tmp_path, monkeypatch): + """User pinned GRAPHIFY_HOME → leave the existing layout untouched.""" + (tmp_path / "graphify-out").mkdir() + monkeypatch.setenv(paths.ENV_HOME, "graphify-out") + assert paths.auto_migrate(tmp_path) is False + assert (tmp_path / "graphify-out").exists() diff --git a/tests/test_watch.py b/tests/test_watch.py index ac396aa6..60eda6f6 100644 --- a/tests/test_watch.py +++ b/tests/test_watch.py @@ -3,6 +3,7 @@ from pathlib import Path import pytest +from graphify import paths as _paths from graphify.watch import _notify_only, _WATCHED_EXTENSIONS @@ -10,20 +11,20 @@ def 
test_notify_only_creates_flag(tmp_path): _notify_only(tmp_path) - flag = tmp_path / "graphify-out" / "needs_update" + flag = _paths.needs_update_path(tmp_path) assert flag.exists() assert flag.read_text() == "1" def test_notify_only_creates_flag_dir(tmp_path): - # graphify-out dir does not exist yet - assert not (tmp_path / "graphify-out").exists() + # home dir does not exist yet + assert not _paths.home(tmp_path).exists() _notify_only(tmp_path) - assert (tmp_path / "graphify-out").is_dir() + assert _paths.home(tmp_path).is_dir() def test_notify_only_idempotent(tmp_path): _notify_only(tmp_path) _notify_only(tmp_path) - flag = tmp_path / "graphify-out" / "needs_update" + flag = _paths.needs_update_path(tmp_path) assert flag.read_text() == "1" @@ -61,7 +62,7 @@ def test_check_update_no_flag_returns_true(tmp_path): def test_check_update_with_flag_returns_true_and_prints(tmp_path, capsys): """check_update returns True and prints notification when flag exists.""" from graphify.watch import check_update - flag = tmp_path / "graphify-out" / "needs_update" + flag = _paths.needs_update_path(tmp_path) flag.parent.mkdir(parents=True, exist_ok=True) flag.write_text("1") result = check_update(tmp_path) @@ -73,7 +74,7 @@ def test_check_update_with_flag_returns_true_and_prints(tmp_path, capsys): def test_check_update_does_not_clear_flag(tmp_path): """check_update never removes the needs_update flag (clearing is LLM's job).""" from graphify.watch import check_update - flag = tmp_path / "graphify-out" / "needs_update" + flag = _paths.needs_update_path(tmp_path) flag.parent.mkdir(parents=True, exist_ok=True) flag.write_text("1") check_update(tmp_path)