Merge pull request #15 from Moonweave-Research/evals/cli-regression-corpus

moonweave · web-flow · commit d62570039d9c · 2026-06-08T15:51:01.000+02:00
Add CLI-level regression ship-gate corpus (evals/)
diff --git a/.github/workflows/live-smoke.yml b/.github/workflows/live-smoke.yml
@@ -34,3 +34,6 @@ jobs:
           ref-verify check-claim 10.1126/science.287.5454.836 \
             --claim "actuation strain above 100%" \
             --json
+
+      - name: CLI regression ship gate
+        run: PYTHONPATH=src python evals/run_cli_regression.py
diff --git a/evals/cli_regression.jsonl b/evals/cli_regression.jsonl
@@ -0,0 +1,12 @@
+{"id":"A1-diez-thermal-above","doi":"10.3390/polym9020059","claim":"The hyperbranched sulfur networks provide a thermal resistance above 200 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract states a high thermal resistance (>220 °C); >220 entails >200, subject+unit co-located, no hedge/scope. Happy-path anchor — must stay ACCEPT on every commit."}
+{"id":"A3-sessler-workfn","doi":"10.1063/1.337646","claim":"The effective work function for aluminum-polyimide is 1.7 eV.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'effective work function for aluminum-polyimide is estimated to be 1.7 eV in the temperature range'. Value is verbatim present; this row guards the physical-measurement condition suffix fix."}
+{"id":"A2-bellucci-30C","doi":"10.1149/1.2086797","claim":"The conductivity measurements were carried out at 30 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"openalex","rationale":"CrossRef has no abstract; OpenAlex provides a DOI-bound abstract with 'Measurements were carried out at 30°C ... in the range'. This row guards OpenAlex reachability plus physical range/field condition handling."}
+{"id":"B2-diez-200g","doi":"10.3390/polym9020059","claim":"The sulfur networks were synthesized on a 200 g scale.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'we synthesized a 200 g scale of amorphous, ... hyperbranched polymeric sulfur networks'. This row guards subject binding across descriptive comma clauses."}
+{"id":"E2-pelrine-117","doi":"10.1126/science.287.5454.836","claim":"Actuated strains up to 117% were demonstrated with silicone elastomers.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract verbatim: 'Actuated strains up to 117% were demonstrated with silicone elastomers'. This row guards claim-side 'up to' comparator handling."}
+{"id":"B1-diez-fabricated","doi":"10.3390/polym9020059","claim":"This paper reports a dielectric breakdown strength of 1200 MV/m.","category":"fabricated_control","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Fabricated number absent from the abstract. Negative control — must never ACCEPT."}
+{"id":"C2-diaham-relational","doi":"10.1063/5.0108674","claim":"The AC conductivity follows sigma_ac proportional to omega^s with the exponent s approaching 1.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Relational claim, no value+unit; not explicitly stated in the abstract. Out of scope — must never ACCEPT."}
+{"id":"C1-simmonstam-relational","doi":"10.1103/PhysRevB.7.3706","claim":"The isothermal current decays as i(t) proportional to 1/t for a uniform trap distribution.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"openalex","rationale":"Relational. OpenAlex may make the abstract reachable, but reachability must not turn an out-of-scope relational claim into ACCEPT."}
+{"id":"D1-amiour-elsevier","doi":"10.1016/j.elstat.2021.103551","claim":"Kapton HN deep trap energies are in the range 0.79 to 1.05 eV.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"Elsevier withholds the abstract from CrossRef and OpenAlex; the 0.79-1.05 eV deep-trap values live in full text, not any abstract. Verdict may be UNVERIFIABLE (no abstract) or PARTIAL (Semantic Scholar abstract reached but the values are not in it) depending on S2 availability; both satisfy the must_not_accept invariant. Abstract-only ceiling — must never ACCEPT."}
+{"id":"D2-jonscher-relational","doi":"10.1038/267673a0","claim":"The dielectric response exponent n lies between 0 and 1.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"No abstract in any source (old Nature) and the claim is relational. Must remain UNVERIFIABLE."}
+{"id":"E1-fake-doi","doi":"10.9999/nonexistent.fake.0000","claim":"This material shows 95% energy conversion efficiency.","category":"dead_doi_control","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"DOI does not resolve. Must fail/UNVERIFIABLE, never ACCEPT."}
+{"id":"B3-diez-overaccept","doi":"10.3390/polym9020059","claim":"The polymeric sulfur networks provide a high thermal resistance of 220 °C.","category":"over_acceptance_regression","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract evidence is '(>220 °C)'. An exact '220 °C' claim must NOT be entailed by '>220'; this row guards against exact-claim over-acceptance."}
diff --git a/evals/cli_regression.md b/evals/cli_regression.md
@@ -0,0 +1,64 @@
+# CLI regression corpus (ship-gate)
+
+`cli_regression.jsonl` is a labeled, deterministic regression set for the
+`check-file` engine. It complements `evals.json` (which evaluates skill-level LLM
+behavior); this one pins **machine-checkable verdicts** so unit/source/matcher
+changes can be regression-tested without an LLM in the loop.
+
+Each row carries the claim **plus** ground-truth labels:
+
+| field | meaning |
+|---|---|
+| `expected_verdict` | the verdict the engine *should* reach |
+| `must_accept` | invariant: this row must end `ACCEPT` on every commit |
+| `must_not_accept` | invariant: this row must **never** end `ACCEPT` |
+| `gated_on` | open issues that currently block `expected_verdict` |
+| `reachable_via` | where an abstract exists: `crossref` / `openalex` / `none` |
+| `category` | `numeric_supported`, `fabricated_control`, `relational_out_of_scope`, `unreachable_ceiling`, `dead_doi_control`, `over_acceptance_regression` |
+
+## Two invariant classes
+
+**SAFETY (release blocker).** `must_accept` rows must stay `ACCEPT`; `must_not_accept`
+rows must never become `ACCEPT`. This is the tool's core promise — no fabricated,
+relational, unreachable, or over-accepting claim is waved through, and the
+supported happy-path claims stay green. A break here fails the gate (non-zero exit).
+
+**PROGRESS (informational).** Gated rows do not yet reach `expected_verdict`
+because a fix has not landed. They are reported, not failed, and flip to PASS as
+their `gated_on` issue is resolved. This is how the corpus tracks the roadmap.
+
+## How to run
+
+```bash
+PYTHONPATH=src python3 evals/run_cli_regression.py
+```
+
+The same command is also wired into the manual GitHub Actions **Live Smoke**
+workflow. It is intentionally not part of every pull-request CI run because it
+depends on live public APIs.
+
+Exit code is non-zero when a SAFETY invariant is violated or when `check-file`
+returns no usable output; PROGRESS rows never change it. (Live network: CrossRef /
+OpenAlex / Semantic Scholar / PubMed. Semantic-Scholar free-tier 429 only affects
+PROGRESS rows that depend on it, never SAFETY rows.)
+
+## What the corpus encodes (snapshot, latest `main`)
+
+```
+SAFETY: 12/12 ok  |  PROGRESS pending: 0
+```
+
+- **Supported happy paths** — `A1` (`>220 °C` entails `>200 °C`), `A2`
+  (OpenAlex-reached `30 °C` conductivity measurements), `A3` (`1.7 eV`), `B2`
+  (`200 g` sulfur-network synthesis), and `E2` (`up to 117%` actuated strain)
+  ACCEPT and must stay green.
+- **Never-accept controls (all PASS)** — `B1` fabricated number, `C1`/`C2` relational,
+  `D1`/`D2` unreachable (Elsevier / old Nature, abstract-only ceiling), `E1` dead DOI.
+  `B3` (over-acceptance) is now `PARTIAL` after #11 — kept `must_not_accept` so the bug
+  cannot silently regress.
+- **No current gated false-negatives** — if future supported rows are added before
+  their matcher/source work lands, they should use `gated_on` and report as
+  PROGRESS rather than failing SAFETY.
+
+The verdict labels for `A2`/`A3`/`B2` were grounded by fetching the live abstracts
+(CrossRef + OpenAlex) and confirming the value appears verbatim; no label asserts
+support that is not in a fetched abstract.
diff --git a/evals/run_cli_regression.py b/evals/run_cli_regression.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Deterministic CLI regression gate for ref-verify.
+
+Runs the labeled corpus in ``cli_regression.jsonl`` through ``check-file`` and
+classifies every row into one of:
+
+- SAFETY pass/fail — invariants that must hold on every commit:
+    * ``must_accept`` rows must end ACCEPT (the supported happy path stays green)
+    * ``must_not_accept`` rows must NOT end ACCEPT (no fabricated/relational/
+      unreachable/over-accepting claim is ever waved through)
+  A SAFETY failure exits non-zero and should block release.
+
+- PROGRESS — gated rows whose ``expected_verdict`` is not yet reached because a
+  named issue (``gated_on``) has not landed. These are reported, not failed; they
+  flip to PASS as their fixes land.
+
+Stdlib only. Usage:
+    PYTHONPATH=src python3 evals/run_cli_regression.py
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+# Labeled corpus file; with_name() resolves it as a sibling of this script.
+CORPUS = Path(__file__).with_name("cli_regression.jsonl")
+
+
+def _load_corpus() -> list[dict]:
+    """Parse the corpus file into one decoded JSON value per non-blank line.
+
+    Every line is stripped of surrounding whitespace first; lines that are empty
+    after stripping are skipped and the rest are decoded with ``json.loads``.
+    """
+    text = CORPUS.read_text(encoding="utf-8")
+    stripped = (raw.strip() for raw in text.splitlines())
+    return [json.loads(entry) for entry in stripped if entry]
+
+
+def _run_cli(rows: list[dict]) -> dict[str, dict]:
+    """Run ``check-file`` over the corpus claims and index its results by row id.
+
+    Only ``id``, ``doi`` and ``claim`` of each row are written to a temporary
+    JSONL file, so the ground-truth labels are not part of the CLI input. The
+    file is removed again whether or not the subprocess (or the write) succeeds.
+
+    Args:
+        rows: corpus rows; each must provide ``id``, ``doi`` and ``claim``.
+
+    Returns:
+        Mapping from result ``id`` to the result object emitted by the CLI.
+
+    Raises:
+        SystemExit: if the CLI prints nothing on stdout, or prints something that
+            is not JSON with a ``results`` entry. The message carries the CLI's
+            stderr so the failure can be diagnosed from the gate log.
+    """
+    claims_path = None
+    try:
+        with tempfile.NamedTemporaryFile(
+            "w", suffix=".jsonl", delete=False, encoding="utf-8"
+        ) as handle:
+            # Record the path before writing so a failed write is still cleaned up.
+            claims_path = handle.name
+            for row in rows:
+                claim = {"id": row["id"], "doi": row["doi"], "claim": row["claim"]}
+                handle.write(json.dumps(claim) + "\n")
+        # The handle is closed here, so the child process can open the file.
+        proc = subprocess.run(
+            [sys.executable, "-m", "ref_verify.cli", "check-file", claims_path, "--json"],
+            capture_output=True,
+            text=True,
+        )
+    finally:
+        if claims_path is not None:
+            Path(claims_path).unlink(missing_ok=True)
+    if not proc.stdout.strip():
+        raise SystemExit(f"check-file produced no JSON. stderr:\n{proc.stderr}")
+    try:
+        payload = json.loads(proc.stdout)
+        results = payload["results"]
+    except (json.JSONDecodeError, KeyError, TypeError) as exc:
+        # Same reporting style as the empty-stdout case instead of a bare traceback.
+        raise SystemExit(
+            f"check-file produced unusable JSON ({exc!r}), exit code "
+            f"{proc.returncode}. stderr:\n{proc.stderr}"
+        ) from exc
+    return {r["id"]: r for r in results}
+
+
+def main() -> int:
+    """Classify every corpus row, print a per-row table and a summary line.
+
+    Per row, the first matching rule wins:
+
+    * ``must_accept`` and the verdict is not ``ACCEPT``      -> SAFETY-FAIL
+    * ``must_not_accept`` and the verdict is ``ACCEPT``      -> SAFETY-FAIL
+    * ``must_not_accept`` and no result came back for the id -> SAFETY-FAIL
+    * ``must_not_accept`` otherwise                          -> PASS
+    * neither verdict nor status equals ``expected_verdict`` -> PENDING
+    * anything else                                          -> PASS
+
+    Returns:
+        1 if at least one row is a SAFETY-FAIL, otherwise 0. PENDING rows are
+        listed but do not affect the return value.
+    """
+    rows = _load_corpus()
+    results = _run_cli(rows)
+
+    safety_failures: list[str] = []
+    progress_pending: list[str] = []
+    print(f"{'id':26}{'verdict':20}{'expected':14}{'class':13}note")
+    print("-" * 92)
+    for row in rows:
+        res = results.get(row["id"], {})
+        verdict = res.get("verdict", "MISSING")
+        status = res.get("status", "")
+        accepted = verdict == "ACCEPT"
+        klass, note = "PASS", ""
+
+        if row.get("must_accept") and not accepted:
+            klass, note = "SAFETY-FAIL", "must ACCEPT but did not"
+            safety_failures.append(row["id"])
+        elif row.get("must_not_accept") and accepted:
+            klass, note = "SAFETY-FAIL", "must NOT ACCEPT but did"
+            safety_failures.append(row["id"])
+        elif row.get("must_not_accept") and row["id"] not in results:
+            # Fail closed: a control row the CLI never reported on was not
+            # evaluated, which is not the same as "evaluated and not accepted".
+            klass, note = "SAFETY-FAIL", "no result returned by check-file"
+            safety_failures.append(row["id"])
+        elif row.get("must_not_accept"):
+            # Control row: the only invariant is "never ACCEPT". The exact non-ACCEPT
+            # verdict (UNVERIFIABLE vs PARTIAL) can vary with source availability, so it
+            # is not pinned.
+            klass = "PASS"
+        elif verdict != row["expected_verdict"] and status != row["expected_verdict"]:
+            gated = ",".join(row.get("gated_on") or []) or "?"
+            klass, note = "PENDING", f"want {row['expected_verdict']} after {gated}"
+            progress_pending.append(row["id"])
+
+        # WARN verdicts are shown together with their status.
+        shown = verdict if verdict != "WARN" else f"{verdict}/{status}"
+        print(f"{row['id']:26}{shown:20}{row['expected_verdict']:14}{klass:13}{note}")
+
+    print("-" * 92)
+    print(
+        f"SAFETY: {len(rows) - len(safety_failures)}/{len(rows)} ok"
+        f"  |  PROGRESS pending: {len(progress_pending)}"
+    )
+    if safety_failures:
+        print("SAFETY FAILURES (release blockers):", ", ".join(safety_failures))
+        return 1
+    if progress_pending:
+        print("Pending (informational, not a failure):", ", ".join(progress_pending))
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())