Skip to content

Commit 1794a6d

Browse files
author
Your Name
committed
Add CERC plan validation
1 parent 8a050d6 commit 1794a6d

6 files changed

Lines changed: 294 additions & 0 deletions

File tree

causetrace/cli.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
compile_subsets,
2626
ingest_feedback,
2727
plan_experiments,
28+
validate_experiment_plan,
2829
reprioritize_experiments,
2930
update_gaps,
3031
)
@@ -365,6 +366,10 @@ def cli(argv: list[str] | None = None) -> None:
365366
p_cr_reprioritize.add_argument("input", help="Feedback report JSON path")
366367
p_cr_reprioritize.add_argument("--output-dir", help="Output directory (default: docs/research/dataset_design/feedback)")
367368
p_cr_reprioritize.add_argument("--json", action="store_true", help="Print reprioritized plan as JSON")
369+
p_cr_validate = p_cr_sub.add_parser("validate-plan", help="Validate a CERC experiment plan without executing it")
370+
p_cr_validate.add_argument("plan_dir", help="Experiment plan directory")
371+
p_cr_validate.add_argument("--output-dir", help="Output directory (default: docs/research/dataset_design/plan_validation)")
372+
p_cr_validate.add_argument("--json", action="store_true", help="Print full validation report as JSON")
368373

369374
p_cmp = sub.add_parser("compare", help="Compare two sessions side by side")
370375
p_cmp.add_argument("session_a", help="First session ID")
@@ -1876,6 +1881,27 @@ def _handle_corpus(store, args) -> None:
18761881
print(f" Top priority: {report['priorities'][0]['subset_id'] if report['priorities'] else 'none'}")
18771882
return
18781883

1884+
if args.corpus_command == "validate-plan":
1885+
report = validate_experiment_plan(
1886+
store,
1887+
plan_dir=args.plan_dir,
1888+
output_dir=args.output_dir,
1889+
write=True,
1890+
)
1891+
if args.json:
1892+
json.dump(report, sys.stdout, indent=2)
1893+
print()
1894+
return
1895+
print(f"Plan validation: {report['output_dir']}")
1896+
print(f" Plan dir: {report['plan_dir']}")
1897+
print(f" Target subset: {report['target_subset']}")
1898+
print(f" Required sessions: {report['required_sessions']}")
1899+
print(f" Missing sessions: {report['necessity']['missing_sessions']}")
1900+
print(f" Duplicate plans: {len(report['duplicate_plans'])}")
1901+
print(f" Validation ok: {report['validation']['ok']}")
1902+
print(f" Status: {report['validation']['status']}")
1903+
return
1904+
18791905
if args.corpus_command == "verify":
18801906
result = verify_snapshot(args.snapshot_dir)
18811907
print(f"Snapshot: {result['snapshot_dir']}")

causetrace/crdd/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .feedback import ingest_feedback, reprioritize_experiments, update_gaps
99
from .experiment_planner import plan_experiments
1010
from .gap_analyzer import analyze_gaps
11+
from .plan_validation import validate_experiment_plan
1112
from .subset_builder import build_subset, compile_subsets
1213
from .subset_registry import SUBSET_DEFINITIONS, get_subset_definition
1314

@@ -20,6 +21,7 @@
2021
"ingest_feedback",
2122
"get_subset_definition",
2223
"plan_experiments",
24+
"validate_experiment_plan",
2325
"reprioritize_experiments",
2426
"update_gaps",
2527
]

causetrace/crdd/plan_validation.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
"""CERC plan validation helpers.
2+
3+
This layer validates experiment plans without executing them. It checks for
4+
queue integrity, duplicate plan signatures, and whether the requested sampling
5+
is still needed.
6+
"""
7+
from __future__ import annotations
8+
9+
import hashlib
10+
import json
11+
from datetime import datetime
12+
from pathlib import Path
13+
from typing import Any
14+
15+
from causetrace.core import JSONStore
16+
17+
from .constraints import validate_execution_queue
18+
from .experiment_planner import DEFAULT_PLAN_OUTPUT_DIR
19+
from .gap_analyzer import analyze_gaps
20+
from .subset_registry import SUBSET_DEFINITIONS
21+
22+
23+
# Fallback directory for validation reports when the caller passes no output_dir.
# NOTE(review): the `validate-plan` CLI help text advertises
# docs/research/dataset_design/plan_validation as the default, which does not
# match this home-directory path - confirm which one is intended.
DEFAULT_PLAN_VALIDATION_OUTPUT_DIR = Path.home() / ".causetrace" / "plan_validation"
24+
25+
26+
def _load_json(path: Path) -> dict[str, Any]:
27+
data = json.loads(path.read_text(encoding="utf-8"))
28+
if not isinstance(data, dict):
29+
raise ValueError(f"{path.name} must contain a JSON object")
30+
return data
31+
32+
33+
def _canonicalize_queue(queue: dict[str, Any]) -> dict[str, Any]:
34+
def _clean(value: Any) -> Any:
35+
if isinstance(value, dict):
36+
cleaned: dict[str, Any] = {}
37+
for key, item in value.items():
38+
if key in {"experiment_id", "generated_at", "output_dir", "queue_hash", "validation"}:
39+
continue
40+
cleaned[key] = _clean(item)
41+
return cleaned
42+
if isinstance(value, list):
43+
return [_clean(item) for item in value]
44+
return value
45+
46+
return _clean(queue)
47+
48+
49+
def _queue_signature(queue: dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of the canonical form of *queue*.

    The queue is first passed through ``_canonicalize_queue`` and then
    serialized as compact JSON with sorted keys, so the digest does not depend
    on key order or on the keys that canonicalization removes.
    """
    payload = json.dumps(
        _canonicalize_queue(queue),
        sort_keys=True,
        separators=(",", ":"),
    ).encode("utf-8")
    digest = hashlib.sha256(payload)
    return digest.hexdigest()
53+
54+
55+
def _scan_duplicate_plans(plan_root: Path, signature: str, current_plan_dir: Path) -> list[str]:
    """Find other plan directories under *plan_root* whose queue matches *signature*.

    Every ``experiment_queue.json`` below *plan_root* is loaded and its
    signature compared with *signature*.

    Args:
        plan_root: Directory that is searched recursively. A missing directory
            yields an empty result.
        signature: Queue signature of the plan being validated.
        current_plan_dir: Directory of the plan being validated; it is never
            reported as a duplicate of itself.

    Returns:
        Sorted list of matching plan directories, as strings.
    """
    if not plan_root.exists():
        return []

    # Compare resolved paths as well: the scan may spell the current plan
    # differently from the caller (".", a symlink, relative vs. absolute), and
    # a plain equality check would then report the plan as its own duplicate.
    current_resolved = current_plan_dir.resolve()

    duplicates: list[str] = []
    for queue_path in plan_root.rglob("experiment_queue.json"):
        candidate_dir = queue_path.parent
        if candidate_dir == current_plan_dir or candidate_dir.resolve() == current_resolved:
            continue
        try:
            queue = _load_json(queue_path)
        except (OSError, ValueError, RecursionError):
            # Best-effort scan: an unreadable, undecodable, malformed or
            # non-object queue file cannot be a duplicate, so skip it.
            # Anything else is a programming error and should surface.
            continue
        if _queue_signature(queue) == signature:
            duplicates.append(str(candidate_dir))
    return sorted(duplicates)
69+
70+
71+
def _validation_status(constraints_ok: bool, duplicate_plans: list[str], needed: bool) -> str:
    """Collapse the three validation checks into a single status label."""
    if constraints_ok and not duplicate_plans and needed:
        return "ready"
    if duplicate_plans:
        return "duplicate"
    if not needed:
        return "not_needed"
    # Only the queue constraint check failed. Reporting "not_needed" here
    # would be wrong: the sampling is still needed, the plan is just invalid.
    return "invalid"


def _render_validation_markdown(plan_name: str, report: dict[str, Any]) -> str:
    """Render the human-readable summary that accompanies the JSON report."""
    lines = [
        f"# Plan validation: {plan_name}",
        "",
        f"- target subset: `{report['target_subset']}`",
        f"- required sessions: `{report['required_sessions']}`",
        f"- missing sessions: `{report['necessity']['missing_sessions']}`",
        f"- sampling needed: `{report['necessity']['sampling_needed']}`",
        f"- duplicate plans: `{len(report['duplicate_plans'])}`",
        f"- validation ok: `{report['validation']['ok']}`",
        f"- queue signature: `{report['queue_signature']}`",
        "",
        "## Safety Boundary",
        "",
        "Plan validation is read-only. It does not execute runtimes, alter evidence grades, or emit commands.",
    ]
    return "\n".join(lines) + "\n"


def validate_experiment_plan(
    store: JSONStore,
    *,
    plan_dir: str | Path,
    output_dir: str | Path | None = None,
    write: bool = True,
) -> dict[str, Any]:
    """Validate an experiment plan without executing it.

    Loads ``experiment_queue.json`` (and ``gap_report.json`` when present)
    from *plan_dir*, runs the queue constraint check, looks for sibling plans
    with the same canonical queue signature, and compares the requested
    sampling against the current gap for the target subset.

    Args:
        store: Session store handed to ``analyze_gaps``.
        plan_dir: Directory containing the plan to validate.
        output_dir: Root directory for the report files. Defaults to
            ``DEFAULT_PLAN_VALIDATION_OUTPUT_DIR``.
        write: When true, write ``plan_validation.json`` and
            ``plan_validation.md`` to ``<output_dir>/<plan directory name>``.

    Returns:
        The validation report. ``report["validation"]["ok"]`` is a bool and
        ``report["validation"]["status"]`` is one of ``"ready"``,
        ``"duplicate"``, ``"not_needed"`` or ``"invalid"`` (the constraint
        check failed for a plan that is otherwise needed and unique).
        ``report["output_dir"]`` is the report directory, or ``None`` when
        *write* is false.

    Raises:
        FileNotFoundError: if *plan_dir* has no ``experiment_queue.json``.
        ValueError: if a plan file is not valid JSON or not a JSON object.
    """
    plan_path = Path(plan_dir)
    queue_path = plan_path / "experiment_queue.json"
    gap_path = plan_path / "gap_report.json"
    if not queue_path.exists():
        raise FileNotFoundError(f"missing plan queue: {queue_path}")

    queue = _load_json(queue_path)
    gap_report = _load_json(gap_path) if gap_path.exists() else {}
    target_subset = str(queue.get("target_subset") or gap_report.get("target_subset") or "unknown")

    current_gap = None
    if target_subset in SUBSET_DEFINITIONS:
        subset_gaps = analyze_gaps(store, subset_ids=[target_subset])["subset_gaps"]
        # An empty analysis is treated like an unknown subset instead of
        # failing with IndexError.
        current_gap = subset_gaps[0] if subset_gaps else None

    constraint_check = validate_execution_queue(queue)
    signature = _queue_signature(queue)
    # A path that is its own parent has no sibling plans to scan, so fall back
    # to the planner's default output directory.
    plan_root = plan_path.parent if plan_path.parent != plan_path else DEFAULT_PLAN_OUTPUT_DIR
    duplicate_plans = _scan_duplicate_plans(plan_root, signature, plan_path)

    required_sessions = int(queue.get("required_sessions", 0) or 0)
    # Without gap data (or with a null value) assume everything the plan
    # requests is still missing.
    reported_missing = (current_gap or {}).get("missing_sessions")
    missing_sessions = required_sessions if reported_missing is None else int(reported_missing)

    needed = missing_sessions > 0 and required_sessions > 0
    constraints_ok = bool(constraint_check["ok"])
    valid = constraints_ok and not duplicate_plans and needed

    report: dict[str, Any] = {
        "schema": "causetrace.cerc.plan_validation.v0.1",
        # NOTE(review): naive local timestamp - confirm whether consumers expect UTC.
        "generated_at": datetime.now().isoformat(),
        "plan_dir": str(plan_path),
        "target_subset": target_subset,
        "required_sessions": required_sessions,
        "current_gap": current_gap,
        "gap_report": gap_report,
        "queue_signature": signature,
        "duplicate_plans": duplicate_plans,
        "constraint_check": constraint_check,
        "necessity": {
            "missing_sessions": missing_sessions,
            "sampling_needed": needed,
        },
        "validation": {
            "ok": valid,
            "status": _validation_status(constraints_ok, duplicate_plans, needed),
        },
        "constraints": {
            "external_only": True,
            "no_execution": True,
            "no_evidence_inflation": True,
            "no_phase4_grade_promotion": True,
        },
    }

    if not write:
        report["output_dir"] = None
        return report

    root = Path(output_dir) if output_dir else DEFAULT_PLAN_VALIDATION_OUTPUT_DIR
    run_dir = root / plan_path.name
    run_dir.mkdir(parents=True, exist_ok=True)
    # Record the destination before serializing so the persisted JSON carries
    # the same keys as the report returned to the caller.
    report["output_dir"] = str(run_dir)
    (run_dir / "plan_validation.json").write_text(
        json.dumps(report, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    (run_dir / "plan_validation.md").write_text(
        _render_validation_markdown(plan_path.name, report),
        encoding="utf-8",
    )
    return report

docs/research/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ Phase 4 must not enter:
6969
- [Subset Manifest Template](dataset_design/subset_manifest_template.md): required structure for comparable and experimental subsets used in Phase 4 candidate revalidation.
7070
- [Causal Experiment Requirement Compiler v0.3](dataset_design/cerc_v0.3.md): experiment planning layer that turns observed subset gaps into external-only execution queues. CERC plans do not execute agents, inflate evidence, or upgrade Phase 4 grades.
7171
- [CERC Feedback Integration v0.4](dataset_design/feedback_v0.4.md): read-only feedback layer that ingests external execution results, updates gap projections, and reprioritizes future experiments without changing runtime authority.
72+
- [CERC Plan Validation v0.3.1](dataset_design/plan_validation_v0.3.1.md): read-only checker for duplicate plans, queue constraints, and whether a planned sampling gap still needs collection.
7273

7374
## Roadmap And Future Directions
7475

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# CERC Plan Validation v0.3.1
2+
3+
Plan validation checks whether a CERC experiment plan is still necessary and
4+
whether it remains safe under the external-only boundary. It is read-only and
5+
does not change corpus data or evidence grades; the only files it writes are its own validation report and summary.
6+
7+
## Definition
8+
9+
```text
10+
Input:
11+
plan directory with experiment_queue.json
12+
optional gap_report.json
13+
14+
Process:
15+
validate queue constraints
16+
compute canonical queue signature
17+
detect duplicate plans
18+
compare requested sampling against current gaps
19+
20+
Output:
21+
validation report and markdown summary
22+
```
23+
24+
## Commands
25+
26+
```bash
27+
causetrace corpus validate-plan docs/research/dataset_design/plans/<experiment_id>
28+
```
29+
30+
## Safety Boundary
31+
32+
Plan validation never executes runtimes, never emits commands, and never
33+
upgrades Phase 4 evidence. It only decides whether a proposed plan is ready,
34+
duplicated, or no longer needed.
35+
36+
## Relationship To CERC
37+
38+
CERC plans the work that is still missing. Validation decides whether that work is still needed
39+
and whether the plan already exists elsewhere in the corpus.

tests/test_metadata_corpus_report.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,3 +1188,72 @@ def test_cerc_feedback_cli_commands(monkeypatch, tmp_path):
11881188
)
11891189
assert reprioritize_cmd.returncode == 0
11901190
assert "Top priority:" in reprioritize_cmd.stdout
1191+
1192+
1193+
def test_cerc_plan_validation_detects_duplicates(monkeypatch, tmp_path):
    """Validating the second of two identical plans reports a duplicate."""
    import causetrace.metadata as metadata
    from causetrace.crdd import plan_experiments, validate_experiment_plan

    monkeypatch.setattr(metadata, "METADATA_DIR", str(tmp_path / "metadata"))
    store = JSONStore(store_dir=str(tmp_path / "data"))
    _write_session(store, "s1")
    merge_metadata("s1", {"runtime": "codex", "task_type": "bug_fix", "task_source": "real_work", "success": False})

    # Two plans that differ only by name, written under the same plans root.
    plans_root = tmp_path / "plans"
    shared_request = {
        "target_subset": "failure_enriched",
        "required_sessions": 5,
        "output_dir": plans_root,
    }
    plan_experiments(store, name="exp_plan_a", **shared_request)
    second_plan = plan_experiments(store, name="exp_plan_b", **shared_request)

    report = validate_experiment_plan(
        store,
        plan_dir=Path(second_plan["output_dir"]),
        output_dir=tmp_path / "plan-validation",
    )

    assert report["constraints"]["external_only"] is True
    assert report["validation"]["status"] == "duplicate"
    assert report["duplicate_plans"]

    # Both report artifacts must have been written.
    artifacts_dir = Path(report["output_dir"])
    for artifact in ("plan_validation.json", "plan_validation.md"):
        assert artifacts_dir.joinpath(artifact).exists()
1223+
1224+
1225+
def test_cerc_plan_validation_cli(monkeypatch, tmp_path):
    """`causetrace corpus validate-plan` exits 0 and prints the summary lines."""
    import causetrace.metadata as metadata
    from causetrace.crdd import plan_experiments

    monkeypatch.setattr(metadata, "METADATA_DIR", str(tmp_path / "metadata"))
    store = JSONStore(store_dir=str(tmp_path / "data"))
    _write_session(store, "s1")
    merge_metadata("s1", {"runtime": "codex", "task_type": "bug_fix", "task_source": "real_work", "success": False})

    plan_result = plan_experiments(
        store,
        target_subset="failure_enriched",
        required_sessions=5,
        name="exp_plan_cli",
        output_dir=tmp_path / "plans",
    )

    cmd = subprocess.run(
        [
            sys.executable,
            "-m",
            "causetrace",
            "corpus",
            "validate-plan",
            str(Path(plan_result["output_dir"])),
            "--output-dir",
            str(tmp_path / "plan-validation"),
        ],
        capture_output=True,
        text=True,
        # Point HOME at the temp dir for the child process.
        env={**os.environ, "HOME": str(tmp_path)},
    )
    # Surface the child's stderr on failure; a bare return-code mismatch gives
    # no hint about what went wrong inside the subprocess.
    assert cmd.returncode == 0, cmd.stderr
    assert "Validation ok:" in cmd.stdout
    assert "Status:" in cmd.stdout

0 commit comments

Comments
 (0)