Add corpus query command and topology morphology classification

Your Name · claude · Your Name · commit fbbf902e41f0 · 2026-05-24T22:30:18.000+08:00
- classify_topology(stats) — heuristic structural phenotype detection:
  dominant_chain, multi_root_exploration, fan_out_heavy,
  collapsed_repair, mixed
- TOPOLOGY_PHENOTYPES — description dict for CLI filter choices
- causetrace corpus [--runtime] [--task] [--topology] [--source] —
  filter/query the session corpus with per-session topology stats
- 5 new tests for classify_topology phenotypes

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/AGENTS.md b/AGENTS.md
@@ -36,7 +36,7 @@ Recent history uses concise imperative subjects such as `Add ...`, `Fix ...`, `U
 <!-- gitnexus:start -->
 # GitNexus — Code Intelligence
 
-This project is indexed by GitNexus as **causetrace** (1733 symbols, 2949 relationships, 111 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
+This project is indexed by GitNexus as **causetrace** (1738 symbols, 2954 relationships, 113 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
 
 > If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
 
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -150,7 +150,7 @@ python3 tools/promote.py devto-post docs/promotion/blog_<topic>.md
 <!-- gitnexus:start -->
 # GitNexus — Code Intelligence
 
-This project is indexed by GitNexus as **causetrace** (1733 symbols, 2949 relationships, 111 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
+This project is indexed by GitNexus as **causetrace** (1738 symbols, 2954 relationships, 113 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
 
 > If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
 
diff --git a/causetrace/analysis.py b/causetrace/analysis.py
@@ -8,6 +8,9 @@
     transition_entropy, branch_density, root_spawning_rate,
     path_reuse_ratio
 
+Layer 1.3 — Morphology (structural phenotype classification):
+    classify_topology
+
 Layer 2 — Pattern (repeated structures, no semantic interpretation):
     detect_repeated_paths, detect_common_transitions,
     detect_fan_in_patterns, detect_branch_collapse
@@ -489,6 +492,74 @@ def path_reuse_ratio(events, max_depth: int = 10) -> dict:
     }
 
 
+# ---------------------------------------------------------------------------
+# Layer 1.3 — Morphology classification
+# ---------------------------------------------------------------------------
+
+def classify_topology(stats: dict) -> str:
+    """Classify session topology into a structural phenotype and return its name.
+
+    Heuristic-only, no semantic interpretation.  ``stats`` is ``compute_stats`` output; absent keys count as 0 (``event_count`` as 1).
+
+    Phenotypes (rules are tested in the order coded below; first match wins):
+
+    ``dominant_chain``
+        Single (or near-single) root, deep relative to size, low branching.
+        Typical of linear fix-and-test loops.
+
+    ``multi_root_exploration``
+        Many roots, shallow depth, low reuse.
+        Typical of reading/searching/discovery behaviour.
+
+    ``fan_out_heavy``
+        One or few roots with wide branching, moderate depth.
+        Typical of parallel task spawning.
+
+    ``collapsed_repair``
+        Significant fan-in / multi-parent convergence.
+        Typical of iterative refinement converging on a target.
+
+    ``mixed``
+        No phenotype clearly dominates; also the result for an empty ``stats``.
+    """
+    rc = stats.get("root_count", 0)
+    mc = stats.get("event_count", 1)
+    depth = stats.get("max_depth", 0)
+    avg_depth = stats.get("avg_depth", 0.0)
+    fan_out_avg = stats.get("fan_out_avg", 0.0)
+    fan_out_max = stats.get("fan_out_max", 0)
+    multi_parent = stats.get("multi_parent_count", 0)
+
+    depth_ratio = depth / mc if mc > 0 else 0  # max depth as a fraction of event count
+
+    # dominant_chain — few roots, deep relative to size
+    if rc <= 2 and depth_ratio > 0.3 and fan_out_avg < 1.5:
+        return "dominant_chain"
+
+    # fan_out_heavy — wide branching from few roots
+    if rc <= 3 and fan_out_max >= 4 and fan_out_avg >= 1.5:
+        return "fan_out_heavy"
+
+    # collapsed_repair — multi-parent share above 5%; ``mc > 0`` avoids dividing by zero
+    if multi_parent >= 3 and mc > 0 and multi_parent / mc > 0.05:
+        return "collapsed_repair"
+
+    # multi_root_exploration — many roots, shallow
+    if rc >= 5 and avg_depth < 3:
+        return "multi_root_exploration"
+
+    return "mixed"
+
+
+TOPOLOGY_PHENOTYPES = {  # keys mirror classify_topology()'s return values; values are one-line descriptions
+    "dominant_chain": "Single-chain deep topology, low branching",
+    "multi_root_exploration": "Many shallow roots, exploration-like",
+    "fan_out_heavy": "Wide branching from few roots",
+    "collapsed_repair": "Significant multi-parent convergence",
+    "mixed": "No dominant structural phenotype",
+}
+
+
 # ---------------------------------------------------------------------------
 # Layer 1.5 — Temporal partitioning primitive (no semantics, no state naming)
 # ---------------------------------------------------------------------------
diff --git a/causetrace/cli.py b/causetrace/cli.py
@@ -12,7 +12,8 @@
 from .analysis import (
     compute_stats, find_roots, longest_path, fan_out_distribution,
     connected_components, detect_repeated_paths, detect_common_transitions,
-    detect_fan_in_patterns, detect_branch_collapse,
+    detect_fan_in_patterns, detect_branch_collapse, classify_topology,
+    TOPOLOGY_PHENOTYPES,
 )
 from .annotation import load_annotation, save_annotation, list_annotated, list_unannotated, TASK_TYPES, SOURCES
 from .causality import causal_quality_report
@@ -234,6 +235,12 @@ def cli(argv: list[str] | None = None) -> None:
     p_an.add_argument("--list", action="store_true", dest="_list", help="List all annotated sessions")
     p_an.add_argument("--unannotated", action="store_true", help="List sessions without annotations")
 
+    p_cr = sub.add_parser("corpus", help="Query and filter session corpus")
+    p_cr.add_argument("--runtime", help="Filter by runtime (e.g. claude, codex, opencode)")
+    p_cr.add_argument("--task", choices=list(TASK_TYPES), help="Filter by task type")
+    p_cr.add_argument("--topology", choices=list(TOPOLOGY_PHENOTYPES), help="Filter by topology phenotype")
+    p_cr.add_argument("--source", choices=list(SOURCES), help="Filter by session source")
+
     p_cmp = sub.add_parser("compare", help="Compare two sessions side by side")
     p_cmp.add_argument("session_a", help="First session ID")
     p_cmp.add_argument("session_b", help="Second session ID")
@@ -563,6 +570,9 @@ def _load(sid: str | None):
     elif args.command == "annotate":
         _handle_annotate(store, args)
 
+    elif args.command == "corpus":
+        _handle_corpus(store, args)
+
     elif args.command == "compare":
         _handle_compare(store, args)
 
@@ -880,6 +890,62 @@ def _handle_annotate(store, args) -> None:
         print(f"  {k}: {v}")
 
 
+def _handle_corpus(store, args) -> None:
+    """Handle ``causetrace corpus``."""
+    sids = store.list_sessions()
+    if not sids:
+        print("No sessions found.")
+        return
+
+    rows = []  # one dict per session, built before any filter is applied
+    for sid in sids:
+        annotation = load_annotation(sid)  # NOTE(review): .get() below assumes a dict is always returned — confirm
+        runtime = annotation.get("runtime", annotation.get("agent", "")) or ""  # "agent" is used only when "runtime" is absent
+        task = annotation.get("task_type", "") or ""
+        source = annotation.get("source", "") or ""
+        topology = annotation.get("topology", "") or ""
+
+        events = store.load(sid)
+        stats = compute_stats(events) if events else {}  # no events -> {} so the numeric columns fall back to 0
+        if not topology:  # an annotated topology wins; otherwise classify from the stats
+            topology = classify_topology(stats)
+        topology = topology or ""
+
+        rows.append({
+            "session_id": sid,
+            "runtime": runtime,
+            "task": task,
+            "topology": topology,
+            "events": stats.get("event_count", 0),
+            "depth": stats.get("max_depth", 0),
+            "roots": stats.get("root_count", 0),
+            "source": source,
+        })
+
+    # Filter: runtime is a case-insensitive substring match; task/topology/source are exact matches
+    if args.runtime:
+        rows = [r for r in rows if args.runtime.lower() in r["runtime"].lower()]
+    if args.task:
+        rows = [r for r in rows if r["task"] == args.task]
+    if args.topology:
+        rows = [r for r in rows if r["topology"] == args.topology]
+    if args.source:
+        rows = [r for r in rows if r["source"] == args.source]
+
+    if not rows:
+        print("No matching sessions.")
+        return
+
+    # Print table (the "source" field can be filtered on but has no column here)
+    header = f"{'Session ID':24s}  {'Runtime':12s}  {'Task':14s}  {'Topology':22s}  {'Events':>6s}  {'Depth':>5s}  {'Roots':>5s}"
+    print(f"Corpus: {len(rows)} session(s)\n")
+    print(header)
+    print("-" * len(header))
+    for r in rows:
+        sid = r["session_id"][:22]  # ids longer than 22 chars are cut to fit the 24-wide column
+        print(f"{sid:24s}  {r['runtime']:12s}  {r['task']:14s}  {r['topology']:22s}  {r['events']:6d}  {r['depth']:5d}  {r['roots']:5d}")
+
+
 def _handle_compare(store, args) -> None:
     """Handle `causetrace compare`."""
     sid_a = args.session_a
diff --git a/tests/test_dag_fixtures.py b/tests/test_dag_fixtures.py
@@ -18,7 +18,7 @@
     compute_stats, find_roots, longest_path, connected_components,
     detect_common_transitions, detect_fan_in_patterns, detect_repeated_paths,
     windowed, transition_entropy, branch_density, root_spawning_rate,
-    path_reuse_ratio,
+    path_reuse_ratio, classify_topology,
 )
 
 FIXTURE_DIR = Path(__file__).resolve().parent / "fixtures" / "dags"
@@ -456,6 +456,39 @@ def test_path_reuse_ratio_empty():
     assert r["total_paths"] == 0
 
 
+# ── Topology classification ──
+
+def test_classify_topology_dominant_chain():
+    stats = {"root_count": 1, "event_count": 100, "max_depth": 80,  # depth/events = 0.8 (> 0.3), one root
+             "avg_depth": 40.0, "fan_out_avg": 0.5, "fan_out_max": 1,  # fan_out_avg below 1.5
+             "multi_parent_count": 0}
+    assert classify_topology(stats) == "dominant_chain"
+
+def test_classify_topology_multi_root():
+    stats = {"root_count": 12, "event_count": 50, "max_depth": 2,  # 12 roots (>= 5)
+             "avg_depth": 0.8, "fan_out_avg": 0.3, "fan_out_max": 2,  # avg_depth below 3
+             "multi_parent_count": 0}
+    assert classify_topology(stats) == "multi_root_exploration"
+
+def test_classify_topology_fan_out_heavy():
+    stats = {"root_count": 1, "event_count": 50, "max_depth": 3,  # depth/events = 0.06, so not a chain
+             "avg_depth": 1.5, "fan_out_avg": 3.0, "fan_out_max": 20,  # fan_out_max >= 4 and fan_out_avg >= 1.5
+             "multi_parent_count": 0}
+    assert classify_topology(stats) == "fan_out_heavy"
+
+def test_classify_topology_collapsed():
+    stats = {"root_count": 2, "event_count": 100, "max_depth": 10,  # depth/events = 0.1, so not a chain
+             "avg_depth": 4.0, "fan_out_avg": 0.8, "fan_out_max": 3,  # fan_out_max below 4, so not fan-out heavy
+             "multi_parent_count": 8}  # 8 of 100 events = 8% (> 5%)
+    assert classify_topology(stats) == "collapsed_repair"
+
+def test_classify_topology_mixed():
+    stats = {"root_count": 4, "event_count": 100, "max_depth": 15,  # 4 roots: above the <= 3 rules, below the >= 5 rule
+             "avg_depth": 5.0, "fan_out_avg": 0.9, "fan_out_max": 3,
+             "multi_parent_count": 1}  # fewer than 3 multi-parent events
+    assert classify_topology(stats) == "mixed"
+
+
 # ── Invariant battery (parametrized over fixtures) ──
 
 INVARIANT_FIXTURES = [