Skip to content

Commit 0421783

Browse files
feat(agents): operator CLI, offline picker, release checker, README rewrite
Adds python -m yuholens.agents CLI for the 4-agent composer with --best-of-n / --judge-mode flags, scripts/run_bestofn_offline.py for heuristic-only picking with no API calls, and scripts/check_release_set.py to validate tokenizer + generation_config + arch before HF upload. Rewrites README.md with the KG-2 PASS headline, the metric arc, a mermaid 4-agent diagram, the cost table, and a sharper quickstart. 85 tests passing.
1 parent 5cbf2e7 commit 0421783

8 files changed

Lines changed: 981 additions & 107 deletions

File tree

README.md

Lines changed: 188 additions & 107 deletions
Large diffs are not rendered by default.

scripts/check_release_set.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
"""Pre-release artefact checker for a YuhoLens checkpoint directory.
2+
3+
Validates the invariants that gate a HuggingFace release:
4+
5+
1. Required tokenizer files are present.
6+
2. ``generation_config.json`` matches the v5 defaults
7+
(temperature 0.1, top_p 0.9, repetition_penalty 1.15,
8+
no_repeat_ngram_size 0). Run ``scripts/hf_upload.py --skip-upload``
9+
to repair this in place.
10+
3. Model weights are present (any of pytorch_model*.bin,
11+
model*.safetensors, or model.safetensors.index.json).
12+
4. ``config.json`` carries the expected base architecture
13+
(``QWenLMHeadModel``).
14+
15+
The exit code is 0 when every check passes, 1 otherwise. The script
16+
never modifies the checkpoint — repair is the operator's job.
17+
"""
18+
19+
from __future__ import annotations
20+
21+
import argparse
22+
import json
23+
import sys
24+
from pathlib import Path
25+
26+
# Expected ``generation_config.json`` values. ``check_generation_config``
# compares each key below against the file and reports every key whose
# stored value differs.
V5_GENERATION_CONFIG: dict[str, object] = {
27+
"do_sample": True,
28+
"temperature": 0.1,
29+
"top_p": 0.9,
30+
"repetition_penalty": 1.15,
31+
"no_repeat_ngram_size": 0,
32+
}
33+
34+
# File names that ``check_tokenizer`` looks for directly under the
# checkpoint directory; any name that does not exist is reported as missing.
REQUIRED_TOKENIZER_FILES: tuple[str, ...] = (
35+
"tokenizer_config.json",
36+
"tokenization_qwen.py",
37+
)
38+
39+
# Glob patterns tried by ``check_weights``; a match for any single pattern
# is enough for the weights check to pass.
WEIGHT_GLOBS: tuple[str, ...] = (
40+
"pytorch_model*.bin",
41+
"model*.safetensors",
42+
"model.safetensors.index.json",
43+
)
44+
45+
46+
def check_tokenizer(model_path: Path) -> list[str]:
    """List the required tokenizer files that ``model_path`` does not contain.

    Args:
        model_path: Checkpoint directory to inspect.

    Returns:
        The names from ``REQUIRED_TOKENIZER_FILES`` that do not exist under
        ``model_path``, in declaration order. An empty list means every
        required file is present.
    """
    absent: list[str] = []
    for filename in REQUIRED_TOKENIZER_FILES:
        candidate = model_path / filename
        if not candidate.exists():
            absent.append(filename)
    return absent
49+
50+
51+
def check_weights(model_path: Path) -> bool:
    """Report whether ``model_path`` holds at least one weight artefact.

    Args:
        model_path: Checkpoint directory to inspect.

    Returns:
        True as soon as any pattern in ``WEIGHT_GLOBS`` matches an entry
        directly under ``model_path``; False when no pattern matches.
    """
    # The inner ``any`` stops at the first match of a pattern and the outer
    # ``any`` stops at the first matching pattern, so evaluation is lazy.
    return any(
        any(model_path.glob(glob_pattern)) for glob_pattern in WEIGHT_GLOBS
    )
57+
58+
59+
def check_generation_config(model_path: Path) -> tuple[bool, list[str]]:
    """Return ``(ok, mismatches)`` for the v5 generation_config invariant.

    Args:
        model_path: Checkpoint directory expected to contain
            ``generation_config.json``.

    Returns:
        ``(True, [])`` when every key in ``V5_GENERATION_CONFIG`` has the
        expected value. Otherwise ``(False, problems)``, where ``problems``
        holds one human-readable line per failure: the file is missing,
        the file cannot be read or parsed, the JSON root is not an object,
        or a key's stored value differs from the expected one.
    """
    config_path = model_path / "generation_config.json"
    if not config_path.exists():
        return False, ["generation_config.json missing"]

    # A corrupt or undecodable file is a failed check, not a crash: the
    # caller prints these lines and still runs the remaining checks.
    # ``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
    # ``ValueError`` subclasses.
    try:
        config = json.loads(config_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        return False, [f"generation_config.json unreadable: {exc}"]

    # Valid JSON may still be a list or scalar, which has no ``.get``.
    if not isinstance(config, dict):
        return False, [
            "generation_config.json must hold a JSON object, "
            f"got {type(config).__name__}"
        ]

    mismatches: list[str] = []
    for key, expected in V5_GENERATION_CONFIG.items():
        # A missing key reads as ``None`` and is reported as a mismatch.
        actual = config.get(key)
        if actual != expected:
            mismatches.append(f"{key}: expected {expected!r}, got {actual!r}")
    return not mismatches, mismatches
71+
72+
73+
def check_arch(model_path: Path) -> tuple[bool, str]:
    """Return ``(ok, detail)`` for the expected Qwen1 architecture string.

    Args:
        model_path: Checkpoint directory expected to contain ``config.json``.

    Returns:
        ``(True, joined)`` when the ``architectures`` list in ``config.json``
        contains ``"QWenLMHeadModel"``; ``joined`` is the comma-separated
        list. Otherwise ``(False, reason)``, where ``reason`` says whether
        the file is missing, unreadable or malformed, not a JSON object, or
        lists unexpected architectures.
    """
    config_path = model_path / "config.json"
    if not config_path.exists():
        return False, "config.json missing"

    # Report a corrupt file as a failed check instead of raising, so the
    # caller can still print its summary. ``json.JSONDecodeError`` and
    # ``UnicodeDecodeError`` are both ``ValueError`` subclasses.
    try:
        config = json.loads(config_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        return False, f"config.json unreadable: {exc}"
    if not isinstance(config, dict):
        return False, (
            f"config.json must hold a JSON object, got {type(config).__name__}"
        )

    # ``or []`` also covers an explicit ``"architectures": null``.
    archs = config.get("architectures", []) or []
    # Require a real list: on a bare string, ``in`` would be a substring
    # test and ``join`` would split the name into single characters.
    if not isinstance(archs, list) or "QWenLMHeadModel" not in archs:
        return False, f"unexpected architectures={archs}"
    # ``str()`` keeps the join from raising on a stray non-string entry.
    return True, ",".join(str(arch) for arch in archs)
83+
84+
85+
def main(argv: list[str] | None = None) -> int:
    """Run every release check against ``--model-path`` and print a report.

    Args:
        argv: Command-line arguments without the program name. ``None``
            makes argparse read ``sys.argv[1:]``.

    Returns:
        0 when the tokenizer, weights, generation_config and architecture
        checks all pass; 1 when any of them fails or when ``--model-path``
        is not a directory.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a numbered list; the default
        # formatter would re-wrap it into a single paragraph in ``--help``.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--model-path",
        type=Path,
        required=True,
        help="Local checkpoint directory.",
    )
    args = parser.parse_args(argv)

    if not args.model_path.is_dir():
        print(
            f"FAIL: --model-path {args.model_path} is not a directory",
            file=sys.stderr,
        )
        return 1

    # Every check runs even after an earlier failure so the operator sees
    # the full list of problems in one pass.
    ok = True

    missing_tokenizer = check_tokenizer(args.model_path)
    if missing_tokenizer:
        ok = False
        print(f"FAIL: tokenizer files missing: {missing_tokenizer}")
    else:
        print(f"OK: tokenizer files present ({len(REQUIRED_TOKENIZER_FILES)})")

    if not check_weights(args.model_path):
        ok = False
        print(f"FAIL: no weight artefacts (looked for {WEIGHT_GLOBS})")
    else:
        print("OK: weight artefacts present")

    gen_ok, mismatches = check_generation_config(args.model_path)
    if gen_ok:
        print("OK: generation_config.json matches v5 defaults")
    else:
        ok = False
        for line in mismatches:
            print(f"FAIL: generation_config: {line}")
        print(
            "      hint: run `python scripts/hf_upload.py "
            f"--model-path {args.model_path} --hf-repo placeholder --skip-upload`"
        )

    arch_ok, arch_detail = check_arch(args.model_path)
    if arch_ok:
        print(f"OK: config.architectures includes QWenLMHeadModel ({arch_detail})")
    else:
        ok = False
        print(f"FAIL: config.json: {arch_detail}")

    print()
    print("RESULT:", "PASS" if ok else "FAIL")
    return 0 if ok else 1
139+
140+
141+
if __name__ == "__main__":
142+
raise SystemExit(main(sys.argv[1:]))

scripts/run_bestofn_offline.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""Offline best-of-N picker over cached memo JSONL sets.
2+
3+
This script is the no-API counterpart to ``scripts/bestofn_judge.py``. It
4+
loads N candidate memo sets and picks the highest-scoring memo per
5+
``custom_id`` using only the laptop-local heuristic from
6+
:func:`yuholens.agents.memo_critic.heuristic_score` — no OpenAI calls,
7+
no GPU, no network. The intended use cases are:
8+
9+
* Reproducing the best-of-N pick distribution on a flight or any
10+
offline laptop without burning batch credits.
11+
* Comparing the heuristic pick distribution against the cached judge
12+
pick distribution to validate the heuristic-vs-judge agreement
13+
claim made in ``docs/blog_post.md`` and ``docs/model-card.md``.
14+
* Smoke-testing the picker contract during development before
15+
shipping a fresh judge pass.
16+
17+
Output schema mirrors ``scripts/bestofn_pick.py`` so the picked artefacts
18+
drop into the same downstream rescore tooling. The script also emits a
19+
pick-share summary and the heuristic mean per source set.
20+
"""
21+
22+
from __future__ import annotations
23+
24+
import argparse
25+
import json
26+
import statistics
27+
import sys
28+
from collections import Counter
29+
from pathlib import Path
30+
from typing import Any
31+
32+
33+
def _ensure_yuholens_on_path() -> None:
    """Prepend the repository's ``src/`` directory to ``sys.path`` if absent.

    The directory is located relative to this file (one level above the
    file's own directory), which lets the script run without ``-m``.
    Calling the function again is a no-op once the entry is present.
    """
    src_dir = str(Path(__file__).resolve().parents[1] / "src")
    if src_dir in sys.path:
        return
    # Index 0 makes this checkout win over any other copy on the path.
    sys.path.insert(0, src_dir)
38+
39+
40+
_ensure_yuholens_on_path()
41+
42+
from yuholens.agents.memo_critic import heuristic_score # noqa: E402
43+
44+
45+
def _load_memos(path: Path) -> dict[str, str]:
    """Load a candidate memo JSONL file as a ``custom_id -> memo`` map.

    Args:
        path: Path to a candidate memo JSONL with ``{"custom_id", "memo"}``
            rows.

    Returns:
        Mapping keyed by ``custom_id``. Blank lines, rows that are not JSON
        objects, and rows whose ``custom_id`` or ``memo`` is missing or not
        a string are skipped silently because best-of-N is robust to
        partial sets. When a ``custom_id`` repeats, the last row wins.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    out: dict[str, str] = {}
    with path.open("r", encoding="utf-8") as fh:
        for lineno, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as exc:
                # Same exception type, plus the file and line so a corrupt
                # row can be found in a large JSONL.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            # A valid JSON list/string/number has no ``.get``; treat it like
            # any other unusable row instead of raising AttributeError.
            if not isinstance(row, dict):
                continue
            cid = row.get("custom_id")
            memo = row.get("memo")
            if isinstance(cid, str) and isinstance(memo, str):
                out[cid] = memo
    return out
68+
69+
70+
def main(argv: list[str] | None = None) -> int:
    """Pick the best-scoring memo per ``custom_id`` and write the results.

    Every candidate memo is scored with ``heuristic_score``; for each
    ``custom_id`` the highest score wins and ties go to the earliest
    ``--memos`` input. The picked memos are written as JSONL to
    ``--picked-memos``, the per-pick scores as a JSON array to
    ``--picked-scores``, and a summary is printed to stdout.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, so existing
            ``main()`` callers are unaffected.

    Returns:
        0 on success.

    Raises:
        SystemExit: ``--labels`` does not have one entry per ``--memos``
            input, or no input yielded any memo. argparse also exits on
            invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's bullet list intact in ``--help``; the
        # default formatter would re-wrap it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--memos",
        type=Path,
        nargs="+",
        required=True,
        help="Candidate memos JSONL files, in priority order (ties go to first).",
    )
    parser.add_argument("--picked-memos", type=Path, required=True)
    parser.add_argument("--picked-scores", type=Path, required=True)
    parser.add_argument(
        "--labels",
        type=str,
        nargs="+",
        default=None,
        help="Human-readable labels per input set; defaults to file stems.",
    )
    args = parser.parse_args(argv)

    labels = args.labels or [p.stem for p in args.memos]
    if len(labels) != len(args.memos):
        raise SystemExit("--labels length must match --memos")

    memo_sets: list[dict[str, str]] = [_load_memos(path) for path in args.memos]
    if not any(memo_sets):
        raise SystemExit("no memos loaded from any --memos input")

    # Sorted union of ids so the output order is deterministic.
    cids: list[str] = sorted(set().union(*memo_sets))
    picked_memos: list[dict[str, Any]] = []
    picked_scores: list[dict[str, Any]] = []
    pick_counter: Counter[str] = Counter()
    per_source_scores: dict[str, list[float]] = {label: [] for label in labels}
    skipped = 0

    for cid in cids:
        best_idx: int | None = None
        best_score = float("-inf")
        for idx, memo_set in enumerate(memo_sets):
            memo = memo_set.get(cid)
            if memo is None:
                continue
            score = heuristic_score(memo)
            per_source_scores[labels[idx]].append(score)
            # Strict ``>`` keeps the earliest input on ties.
            if score > best_score:
                best_idx = idx
                best_score = score
        if best_idx is None:
            # No candidate score compared greater than -inf.
            skipped += 1
            continue
        picked_memos.append(
            {"custom_id": cid, "memo": memo_sets[best_idx][cid]}
        )
        picked_scores.append(
            {
                "custom_id": cid,
                "heuristic_score": round(best_score, 4),
                "source": labels[best_idx],
            }
        )
        pick_counter[labels[best_idx]] += 1

    args.picked_memos.parent.mkdir(parents=True, exist_ok=True)
    with args.picked_memos.open("w", encoding="utf-8") as fh:
        for record in picked_memos:
            fh.write(json.dumps(record, ensure_ascii=False) + "\n")
    args.picked_scores.parent.mkdir(parents=True, exist_ok=True)
    args.picked_scores.write_text(
        json.dumps(picked_scores, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )

    total_picked = len(picked_memos)
    print(
        f"[bestofn-offline] picked {total_picked} memos "
        f"(skipped {skipped})"
    )
    # Report every source, including those that won no pick, so the
    # per-source heuristic mean is always visible. ``Counter`` returns 0
    # for labels it has never seen.
    for label in sorted(per_source_scores):
        count = pick_counter[label]
        share = count / max(total_picked, 1)
        scores = per_source_scores[label]
        if scores:
            mean = statistics.fmean(scores)
            print(
                f"  pick_share[{label}]: {count}/{total_picked} "
                f"({share:.1%}) source_mean_heuristic={mean:.3f}"
            )
        else:
            print(f"  pick_share[{label}]: {count}/{total_picked} ({share:.1%})")
    if picked_scores:
        all_picks = [r["heuristic_score"] for r in picked_scores]
        print(
            f"[bestofn-offline] picked_mean_heuristic="
            f"{statistics.fmean(all_picks):.3f} "
            f"median={statistics.median(all_picks):.3f} "
            f"n={len(all_picks)}"
        )
    print(f"[bestofn-offline] wrote picked memos -> {args.picked_memos}")
    print(f"[bestofn-offline] wrote picked scores -> {args.picked_scores}")
    return 0
168+
169+
170+
if __name__ == "__main__":
171+
raise SystemExit(main())

src/yuholens/agents/__main__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Allow ``python -m yuholens.agents`` to invoke the operator CLI."""
2+
3+
from __future__ import annotations
4+
5+
import sys
6+
7+
from yuholens.agents.cli import main
8+
9+
if __name__ == "__main__":
10+
raise SystemExit(main(sys.argv[1:]))

0 commit comments

Comments
 (0)