javierdejesusda
diff --git a/‎README.md‎
Lines changed: 188 additions & 107 deletions b/‎README.md‎
Lines changed: 188 additions & 107 deletions
diff --git a/‎scripts/check_release_set.py‎
Lines changed: 142 additions & 0 deletions b/‎scripts/check_release_set.py‎
Lines changed: 142 additions & 0 deletions
diff --git a/‎scripts/run_bestofn_offline.py‎
Lines changed: 171 additions & 0 deletions b/‎scripts/run_bestofn_offline.py‎
Lines changed: 171 additions & 0 deletions
diff --git a/‎src/yuholens/agents/__main__.py‎
Lines changed: 10 additions & 0 deletions b/‎src/yuholens/agents/__main__.py‎
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,142 @@
+"""Pre-release artefact checker for a YuhoLens checkpoint directory.
+
+Validates the invariants that gate a HuggingFace release:
+
+    1. Required tokenizer files are present.
+    2. ``generation_config.json`` matches the v5 defaults
+       (temperature 0.1, top_p 0.9, repetition_penalty 1.15,
+       no_repeat_ngram_size 0). Run ``scripts/hf_upload.py --skip-upload``
+       to repair this in place.
+    3. Model weights are present (any of pytorch_model*.bin,
+       model*.safetensors, or model.safetensors.index.json).
+    4. ``config.json`` carries the expected base architecture
+       (``QWenLMHeadModel``).
+
+The exit code is 0 when every check passes, 1 otherwise. The script
+never modifies the checkpoint — repair is the operator's job.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Generation settings that generation_config.json must carry; the module
+# docstring calls these the "v5 defaults".
+V5_GENERATION_CONFIG: dict[str, object] = {
+    "do_sample": True,
+    "temperature": 0.1,
+    "top_p": 0.9,
+    "repetition_penalty": 1.15,
+    "no_repeat_ngram_size": 0,
+}
+
+# Tokenizer files that must exist directly inside the checkpoint directory.
+REQUIRED_TOKENIZER_FILES: tuple[str, ...] = (
+    "tokenizer_config.json",
+    "tokenization_qwen.py",
+)
+
+# Glob patterns for weight artefacts; check_weights() accepts the checkpoint
+# as soon as any one pattern matches at least one path.
+WEIGHT_GLOBS: tuple[str, ...] = (
+    "pytorch_model*.bin",
+    "model*.safetensors",
+    "model.safetensors.index.json",
+)
+
+
+def check_tokenizer(model_path: Path) -> list[str]:
+    """List the required tokenizer files that ``model_path`` lacks.
+
+    Args:
+        model_path: Checkpoint directory to inspect.
+
+    Returns:
+        The entries of ``REQUIRED_TOKENIZER_FILES`` that do not exist under
+        ``model_path``, in declaration order; empty when all are present.
+    """
+    absent: list[str] = []
+    for filename in REQUIRED_TOKENIZER_FILES:
+        candidate = model_path / filename
+        if candidate.exists():
+            continue
+        absent.append(filename)
+    return absent
+
+
+def check_weights(model_path: Path) -> bool:
+    """Report whether ``model_path`` holds at least one weight artefact.
+
+    Args:
+        model_path: Checkpoint directory to inspect.
+
+    Returns:
+        True as soon as any pattern in ``WEIGHT_GLOBS`` matches a path
+        directly under ``model_path``; False when none of them match.
+    """
+    # Both any() calls short-circuit, so globbing stops at the first hit.
+    return any(
+        any(model_path.glob(weight_pattern)) for weight_pattern in WEIGHT_GLOBS
+    )
+
+
+def check_generation_config(model_path: Path) -> tuple[bool, list[str]]:
+    """Return ``(ok, mismatches)`` for the v5 generation_config invariant.
+
+    Args:
+        model_path: Checkpoint directory expected to contain
+            ``generation_config.json``.
+
+    Returns:
+        A pair ``(ok, mismatches)``. ``ok`` is True only when every key of
+        ``V5_GENERATION_CONFIG`` is present with the expected value.
+        ``mismatches`` holds one human-readable line per problem: a missing
+        file, an unreadable or malformed file, a non-object top level, or
+        a key whose value differs from the expected one.
+    """
+    config_path = model_path / "generation_config.json"
+    if not config_path.exists():
+        return False, ["generation_config.json missing"]
+    try:
+        config = json.loads(config_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        # Report the problem as a failed check so the remaining checks still
+        # run, instead of aborting the whole script with a traceback.
+        return False, [f"generation_config.json unreadable: {exc}"]
+    if not isinstance(config, dict):
+        return False, [
+            "generation_config.json must hold a JSON object, "
+            f"got {type(config).__name__}"
+        ]
+    mismatches: list[str] = []
+    for key, expected in V5_GENERATION_CONFIG.items():
+        actual = config.get(key)
+        # bool is an int subclass (1 == True, 0 == False), so a plain
+        # equality test would accept e.g. ``"do_sample": 1``. Require the
+        # bool-ness of both sides to agree before comparing values.
+        same_kind = isinstance(actual, bool) == isinstance(expected, bool)
+        if not same_kind or actual != expected:
+            mismatches.append(f"{key}: expected {expected!r}, got {actual!r}")
+    return not mismatches, mismatches
+
+
+def check_arch(model_path: Path) -> tuple[bool, str]:
+    """Return ``(ok, detail)`` for the expected ``QWenLMHeadModel`` entry.
+
+    Args:
+        model_path: Checkpoint directory expected to contain ``config.json``.
+
+    Returns:
+        ``(True, joined)`` when ``config.json`` holds an ``architectures``
+        list containing ``"QWenLMHeadModel"``, where ``joined`` is the
+        comma-separated list. Otherwise ``(False, reason)``, where
+        ``reason`` names the missing file, the read or parse failure, or
+        the unexpected ``architectures`` value.
+    """
+    config_path = model_path / "config.json"
+    if not config_path.exists():
+        return False, "config.json missing"
+    try:
+        config = json.loads(config_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        # Report the problem as a failed check instead of raising.
+        return False, f"config.json unreadable: {exc}"
+    if not isinstance(config, dict):
+        return False, (
+            f"config.json must hold a JSON object, got {type(config).__name__}"
+        )
+    archs = config.get("architectures", []) or []
+    # Insist on a real list: ``in`` on a string is a substring test, so a
+    # string-valued field could match by accident.
+    if not isinstance(archs, list) or "QWenLMHeadModel" not in archs:
+        return False, f"unexpected architectures={archs!r}"
+    # str() keeps the join from raising when the list also carries
+    # non-string entries.
+    return True, ",".join(str(arch) for arch in archs)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run every release check against ``--model-path`` and report results.
+
+    Args:
+        argv: Command-line arguments without the program name; ``None``
+            lets argparse fall back to ``sys.argv[1:]``.
+
+    Returns:
+        0 when all checks pass; 1 when any check fails or when
+        ``--model-path`` is not a directory.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--model-path",
+        type=Path,
+        required=True,
+        help="Local checkpoint directory.",
+    )
+    model_path: Path = parser.parse_args(argv).model_path
+
+    if not model_path.is_dir():
+        print(
+            f"FAIL: --model-path {model_path} is not a directory",
+            file=sys.stderr,
+        )
+        return 1
+
+    # One entry per check, in the order the checks run and print.
+    outcomes: list[bool] = []
+
+    absent_files = check_tokenizer(model_path)
+    outcomes.append(not absent_files)
+    if absent_files:
+        print(f"FAIL: tokenizer files missing: {absent_files}")
+    else:
+        print(f"OK:   tokenizer files present ({len(REQUIRED_TOKENIZER_FILES)})")
+
+    has_weights = check_weights(model_path)
+    outcomes.append(has_weights)
+    if has_weights:
+        print("OK:   weight artefacts present")
+    else:
+        print(f"FAIL: no weight artefacts (looked for {WEIGHT_GLOBS})")
+
+    gen_ok, mismatches = check_generation_config(model_path)
+    outcomes.append(gen_ok)
+    if not gen_ok:
+        for problem in mismatches:
+            print(f"FAIL: generation_config: {problem}")
+        print(
+            "      hint: run `python scripts/hf_upload.py "
+            f"--model-path {model_path} --hf-repo placeholder --skip-upload`"
+        )
+    else:
+        print("OK:   generation_config.json matches v5 defaults")
+
+    arch_ok, arch_detail = check_arch(model_path)
+    outcomes.append(arch_ok)
+    if not arch_ok:
+        print(f"FAIL: config.json: {arch_detail}")
+    else:
+        print(f"OK:   config.architectures includes QWenLMHeadModel ({arch_detail})")
+
+    all_passed = all(outcomes)
+    print()
+    print("RESULT:", "PASS" if all_passed else "FAIL")
+    return 0 if all_passed else 1
+
+
+# Script entry point: exit with main()'s status (0 on PASS, 1 on FAIL).
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
@@ -0,0 +1,171 @@
+"""Offline best-of-N picker over cached memo JSONL sets.
+
+This script is the no-API counterpart to ``scripts/bestofn_judge.py``. It
+loads N candidate memo sets and picks the highest-scoring memo per
+``custom_id`` using only the laptop-local heuristic from
+:func:`yuholens.agents.memo_critic.heuristic_score` — no OpenAI calls,
+no GPU, no network. The intended use cases are:
+
+    * Reproducing the best-of-N pick distribution on a flight or any
+      offline laptop without burning batch credits.
+    * Comparing the heuristic pick distribution against the cached judge
+      pick distribution to validate the heuristic-vs-judge agreement
+      claim made in ``docs/blog_post.md`` and ``docs/model-card.md``.
+    * Smoke-testing the picker contract during development before
+      shipping a fresh judge pass.
+
+Output schema mirrors ``scripts/bestofn_pick.py`` so the picked artefacts
+drop into the same downstream rescore tooling. The script also emits a
+pick-share summary and the heuristic mean per source set.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+
+def _ensure_yuholens_on_path() -> None:
+    """Make the sibling ``src/`` directory importable for direct script runs.
+
+    The directory is resolved as ``<parent of this file's directory>/src``
+    and placed at the front of ``sys.path`` unless it is already listed.
+    """
+    src_dir = str(Path(__file__).resolve().parents[1] / "src")
+    if src_dir in sys.path:
+        return
+    sys.path.insert(0, src_dir)
+
+
+_ensure_yuholens_on_path()
+
+from yuholens.agents.memo_critic import heuristic_score  # noqa: E402
+
+
+def _load_memos(path: Path) -> dict[str, str]:
+    """Load a candidate memo JSONL file as a ``custom_id -> memo`` map.
+
+    Args:
+        path: Path to a candidate memo JSONL with ``{"custom_id", "memo"}``
+            rows.
+
+    Returns:
+        Mapping keyed by ``custom_id``. Blank lines, rows that are not JSON
+        objects, and rows where either field is missing or not a string are
+        skipped silently because best-of-N is robust to partial sets. When
+        a ``custom_id`` repeats, the last row wins.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    out: dict[str, str] = {}
+    with path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if not isinstance(row, dict):
+                # Valid JSON that is not an object (list, string, null, ...)
+                # has no fields to read; skip it like any other partial row
+                # instead of failing on ``.get``.
+                continue
+            cid = row.get("custom_id")
+            memo = row.get("memo")
+            if isinstance(cid, str) and isinstance(memo, str):
+                out[cid] = memo
+    return out
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Pick the best memo per ``custom_id`` and write the picked artefacts.
+
+    Every candidate set is scored with ``heuristic_score``. The winner for
+    each ``custom_id`` goes to ``--picked-memos`` (JSONL), the matching
+    score records go to ``--picked-scores`` (JSON array), and a pick-share
+    summary for every source set is printed.
+
+    Args:
+        argv: Command-line arguments without the program name. ``None``
+            makes argparse read ``sys.argv[1:]``, which the script entry
+            point relies on.
+
+    Returns:
+        Process exit status; always 0 when the function returns normally.
+
+    Raises:
+        SystemExit: When ``--labels`` and ``--memos`` differ in length, or
+            when no input file yields any memo.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--memos",
+        type=Path,
+        nargs="+",
+        required=True,
+        help="Candidate memos JSONL files, in priority order (ties go to first).",
+    )
+    parser.add_argument("--picked-memos", type=Path, required=True)
+    parser.add_argument("--picked-scores", type=Path, required=True)
+    parser.add_argument(
+        "--labels",
+        type=str,
+        nargs="+",
+        default=None,
+        help="Human-readable labels per input set; defaults to file stems.",
+    )
+    args = parser.parse_args(argv)
+
+    labels = args.labels or [p.stem for p in args.memos]
+    if len(labels) != len(args.memos):
+        raise SystemExit("--labels length must match --memos")
+
+    memo_sets: list[dict[str, str]] = [_load_memos(path) for path in args.memos]
+    if not any(memo_sets):
+        raise SystemExit("no memos loaded from any --memos input")
+
+    # Sorted union of ids keeps the output order deterministic.
+    cids: list[str] = sorted(set().union(*[set(m.keys()) for m in memo_sets]))
+    picked_memos: list[dict[str, Any]] = []
+    picked_scores: list[dict[str, Any]] = []
+    pick_counter: Counter[str] = Counter()
+    per_source_scores: dict[str, list[float]] = {label: [] for label in labels}
+    skipped = 0
+
+    for cid in cids:
+        best_idx: int | None = None
+        best_score = float("-inf")
+        for idx, memo_set in enumerate(memo_sets):
+            memo = memo_set.get(cid)
+            if memo is None:
+                continue
+            score = heuristic_score(memo)
+            per_source_scores[labels[idx]].append(score)
+            # Strict ``>`` keeps the earliest set on ties, matching the
+            # priority order promised by the --memos help text.
+            if score > best_score:
+                best_idx = idx
+                best_score = score
+        if best_idx is None:
+            # No candidate scored above -inf for this id.
+            skipped += 1
+            continue
+        picked_memos.append(
+            {"custom_id": cid, "memo": memo_sets[best_idx][cid]}
+        )
+        picked_scores.append(
+            {
+                "custom_id": cid,
+                "heuristic_score": round(best_score, 4),
+                "source": labels[best_idx],
+            }
+        )
+        pick_counter[labels[best_idx]] += 1
+
+    args.picked_memos.parent.mkdir(parents=True, exist_ok=True)
+    with args.picked_memos.open("w", encoding="utf-8") as fh:
+        for record in picked_memos:
+            fh.write(json.dumps(record, ensure_ascii=False) + "\n")
+    args.picked_scores.parent.mkdir(parents=True, exist_ok=True)
+    args.picked_scores.write_text(
+        json.dumps(picked_scores, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+
+    print(
+        f"[bestofn-offline] picked {len(picked_memos)} memos "
+        f"(skipped {skipped})"
+    )
+    # Walk every source label, not just the ones that won a pick, so a set
+    # with zero picks still reports its 0% share and heuristic mean.
+    # Counter returns 0 for labels it has never seen.
+    for label in sorted(per_source_scores):
+        count = pick_counter[label]
+        share = count / max(len(picked_memos), 1)
+        scores = per_source_scores[label]
+        if scores:
+            mean = statistics.fmean(scores)
+            print(
+                f"  pick_share[{label}]: {count}/{len(picked_memos)} "
+                f"({share:.1%})  source_mean_heuristic={mean:.3f}"
+            )
+        else:
+            # The source contributed no scored memo, so there is no mean.
+            print(f"  pick_share[{label}]: {count}/{len(picked_memos)} ({share:.1%})")
+    if picked_scores:
+        all_picks = [r["heuristic_score"] for r in picked_scores]
+        print(
+            f"[bestofn-offline] picked_mean_heuristic="
+            f"{statistics.fmean(all_picks):.3f} "
+            f"median={statistics.median(all_picks):.3f} "
+            f"n={len(all_picks)}"
+        )
+    print(f"[bestofn-offline] wrote picked memos -> {args.picked_memos}")
+    print(f"[bestofn-offline] wrote picked scores -> {args.picked_scores}")
+    return 0
+
+
+# Script entry point: main() parses the process arguments itself and its
+# return value becomes the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,10 @@
+"""Allow ``python -m yuholens.agents`` to invoke the operator CLI."""
+
+from __future__ import annotations
+
+import sys
+
+from yuholens.agents.cli import main
+
+# Forward the command-line arguments (program name stripped) to the CLI
+# and exit with whatever it returns — presumably an exit status; confirm
+# against yuholens.agents.cli.main.
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))