Skip to content

Commit d625700

Browse files
authored
Merge pull request #15 from Moonweave-Research/evals/cli-regression-corpus
Add CLI-level regression ship-gate corpus (evals/)
2 parents 761a3c2 + d7af64f commit d625700

4 files changed

Lines changed: 187 additions & 0 deletions

File tree

.github/workflows/live-smoke.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,6 @@ jobs:
3434
ref-verify check-claim 10.1126/science.287.5454.836 \
3535
--claim "actuation strain above 100%" \
3636
--json
37+
38+
- name: CLI regression ship gate
39+
run: PYTHONPATH=src python evals/run_cli_regression.py

evals/cli_regression.jsonl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{"id":"A1-diez-thermal-above","doi":"10.3390/polym9020059","claim":"The hyperbranched sulfur networks provide a thermal resistance above 200 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract states a high thermal resistance (>220 °C); >220 entails >200, subject+unit co-located, no hedge/scope. Happy-path anchor — must stay ACCEPT on every commit."}
2+
{"id":"A3-sessler-workfn","doi":"10.1063/1.337646","claim":"The effective work function for aluminum-polyimide is 1.7 eV.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'effective work function for aluminum-polyimide is estimated to be 1.7 eV in the temperature range'. Value is verbatim present; this row guards the physical-measurement condition suffix fix."}
3+
{"id":"A2-bellucci-30C","doi":"10.1149/1.2086797","claim":"The conductivity measurements were carried out at 30 °C.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"openalex","rationale":"CrossRef has no abstract; OpenAlex provides a DOI-bound abstract with 'Measurements were carried out at 30°C ... in the range'. This row guards OpenAlex reachability plus physical range/field condition handling."}
4+
{"id":"B2-diez-200g","doi":"10.3390/polym9020059","claim":"The sulfur networks were synthesized on a 200 g scale.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract: 'we synthesized a 200 g scale of amorphous, ... hyperbranched polymeric sulfur networks'. This row guards subject binding across descriptive comma clauses."}
5+
{"id":"E2-pelrine-117","doi":"10.1126/science.287.5454.836","claim":"Actuated strains up to 117% were demonstrated with silicone elastomers.","category":"numeric_supported","expected_verdict":"ACCEPT","must_accept":true,"must_not_accept":false,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract verbatim: 'Actuated strains up to 117% were demonstrated with silicone elastomers'. This row guards claim-side 'up to' comparator handling."}
6+
{"id":"B1-diez-fabricated","doi":"10.3390/polym9020059","claim":"This paper reports a dielectric breakdown strength of 1200 MV/m.","category":"fabricated_control","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Fabricated number absent from the abstract. Negative control — must never ACCEPT."}
7+
{"id":"C2-diaham-relational","doi":"10.1063/5.0108674","claim":"The AC conductivity follows sigma_ac proportional to omega^s with the exponent s approaching 1.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Relational claim, no value+unit; not explicitly stated in the abstract. Out of scope — must never ACCEPT."}
8+
{"id":"C1-simmonstam-relational","doi":"10.1103/PhysRevB.7.3706","claim":"The isothermal current decays as i(t) proportional to 1/t for a uniform trap distribution.","category":"relational_out_of_scope","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"openalex","rationale":"Relational. OpenAlex may make the abstract reachable, but reachability must not turn an out-of-scope relational claim into ACCEPT."}
9+
{"id":"D1-amiour-elsevier","doi":"10.1016/j.elstat.2021.103551","claim":"Kapton HN deep trap energies are in the range 0.79 to 1.05 eV.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"Elsevier withholds the abstract from CrossRef and OpenAlex; the 0.79-1.05 eV deep-trap values live in full text, not any abstract. Verdict may be UNVERIFIABLE (no abstract) or PARTIAL (Semantic Scholar abstract reached but the values are not in it) depending on S2 availability; both satisfy the must_not_accept invariant. Abstract-only ceiling — must never ACCEPT."}
10+
{"id":"D2-jonscher-relational","doi":"10.1038/267673a0","claim":"The dielectric response exponent n lies between 0 and 1.","category":"unreachable_ceiling","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"No abstract in any source (old Nature) and the claim is relational. Must remain UNVERIFIABLE."}
11+
{"id":"E1-fake-doi","doi":"10.9999/nonexistent.fake.0000","claim":"This material shows 95% energy conversion efficiency.","category":"dead_doi_control","expected_verdict":"UNVERIFIABLE","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"none","rationale":"DOI does not resolve. Must fail/UNVERIFIABLE, never ACCEPT."}
12+
{"id":"B3-diez-overaccept","doi":"10.3390/polym9020059","claim":"The polymeric sulfur networks provide a high thermal resistance of 220 °C.","category":"over_acceptance_regression","expected_verdict":"PARTIAL","must_accept":false,"must_not_accept":true,"gated_on":[],"reachable_via":"crossref","rationale":"Abstract evidence is '(>220 °C)'. An exact '220 °C' claim must NOT be entailed by '>220'; this row guards against exact-claim over-acceptance."}

evals/cli_regression.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# CLI regression corpus (ship-gate)
2+
3+
`cli_regression.jsonl` is a labeled, deterministic regression set for the
4+
`check-file` engine. It complements `evals.json` (which evaluates skill-level LLM
5+
behavior); this one pins **machine-checkable verdicts** so unit/source/matcher
6+
changes can be regression-tested without an LLM in the loop.
7+
8+
Each row carries the claim **plus** ground-truth labels:
9+
10+
| field | meaning |
11+
|---|---|
12+
| `expected_verdict` | the verdict the engine *should* reach |
13+
| `must_accept` | invariant: this row must end `ACCEPT` on every commit |
14+
| `must_not_accept` | invariant: this row must **never** end `ACCEPT` |
15+
| `gated_on` | open issues that currently block `expected_verdict` |
16+
| `reachable_via` | where an abstract exists: `crossref` / `openalex` / `none` |
17+
| `category` | `numeric_supported`, `fabricated_control`, `relational_out_of_scope`, `unreachable_ceiling`, `dead_doi_control`, `over_acceptance_regression` |
18+
19+
## Two invariant classes
20+
21+
**SAFETY (release blocker).** `must_accept` rows must stay `ACCEPT`; `must_not_accept`
22+
rows must never become `ACCEPT`. This is the tool's core promise — no fabricated,
23+
relational, unreachable, or over-accepting claim is waved through, and every
24+
clean supported claim stays green. A break here fails the gate (non-zero exit).
25+
26+
**PROGRESS (informational).** Gated rows do not yet reach `expected_verdict`
27+
because a fix has not landed. They are reported, not failed, and flip to PASS as
28+
their `gated_on` issue is resolved. This is how the corpus tracks the roadmap.
29+
30+
## How to run
31+
32+
```bash
33+
PYTHONPATH=src python3 evals/run_cli_regression.py
34+
```
35+
36+
The same command is also wired into the manual GitHub Actions **Live Smoke**
37+
workflow. It is intentionally not part of every pull-request CI run because it
38+
depends on live public APIs.
39+
40+
Exit code is non-zero if a SAFETY invariant is violated or `check-file` emits no JSON. (Live network: CrossRef /
41+
OpenAlex / Semantic Scholar / PubMed. Semantic-Scholar free-tier 429 only affects
42+
PROGRESS rows that depend on it, never SAFETY rows.)
43+
44+
## What the corpus encodes (snapshot, latest `main`)
45+
46+
```
47+
SAFETY: 12/12 ok | PROGRESS pending: 0
48+
```
49+
50+
- **Supported happy paths** — `A1` (`>220 °C` entails `>200 °C`), `A2`
51+
(OpenAlex-reached `30 °C` conductivity measurements), `A3` (`1.7 eV`), `B2`
52+
(`200 g` sulfur-network synthesis), and `E2` (`up to 117%` actuated strain)
53+
ACCEPT and must stay green.
54+
- **Never-accept controls (all PASS)** — `B1` fabricated number, `C1`/`C2` relational,
55+
`D1`/`D2` unreachable (Elsevier / old Nature, abstract-only ceiling), `E1` dead DOI.
56+
`B3` (over-acceptance) is now `PARTIAL` after #11 — kept `must_not_accept` so the bug
57+
cannot silently regress.
58+
- **No current gated false-negatives** — if future supported rows are added before
59+
their matcher/source work lands, they should use `gated_on` and report as
60+
PROGRESS rather than failing SAFETY.
61+
62+
The verdict labels for `A2`/`A3`/`B2` were grounded by fetching the live abstracts
63+
(CrossRef + OpenAlex) and confirming the value appears verbatim; no label asserts
64+
support that is not in a fetched abstract.

evals/run_cli_regression.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#!/usr/bin/env python3
2+
"""Deterministic CLI regression gate for ref-verify.
3+
4+
Runs the labeled corpus in ``cli_regression.jsonl`` through ``check-file`` and
5+
classifies every row into one of:
6+
7+
- SAFETY pass/fail — invariants that must hold on every commit:
8+
* ``must_accept`` rows must end ACCEPT (the supported happy path stays green)
9+
* ``must_not_accept`` rows must NOT end ACCEPT (no fabricated/relational/
10+
unreachable/over-accepting claim is ever waved through)
11+
A SAFETY failure exits non-zero and should block release.
12+
13+
- PROGRESS — gated rows whose ``expected_verdict`` is not yet reached because a
14+
named issue (``gated_on``) has not landed. These are reported, not failed; they
15+
flip to PASS as their fixes land.
16+
17+
Stdlib only. Usage:
18+
PYTHONPATH=src python3 evals/run_cli_regression.py
19+
"""
20+
21+
from __future__ import annotations
22+
23+
import json
24+
import subprocess
25+
import sys
26+
import tempfile
27+
from pathlib import Path
28+
29+
# The labeled corpus sits next to this script, so it is found regardless of the
# directory the gate is launched from.
CORPUS = Path(__file__).parent / "cli_regression.jsonl"
30+
31+
32+
def _load_corpus() -> list[dict]:
    """Parse the JSONL corpus into a list of row dicts.

    Each non-blank line of ``CORPUS`` is decoded as one JSON document; lines
    that are empty after stripping whitespace are skipped.
    """
    text = CORPUS.read_text(encoding="utf-8")
    stripped = (raw.strip() for raw in text.splitlines())
    return [json.loads(entry) for entry in stripped if entry]
39+
40+
41+
def _run_cli(rows: list[dict]) -> dict[str, dict]:
    """Run ``check-file`` over *rows* and return its results keyed by row id.

    Only the ``id``, ``doi`` and ``claim`` fields of each row are written to the
    temporary claims file handed to the CLI; the ground-truth labels stay here.

    Args:
        rows: corpus rows, each providing ``id``, ``doi`` and ``claim``.

    Returns:
        Mapping from result ``id`` to the corresponding entry of the CLI's
        ``results`` list.

    Raises:
        SystemExit: if the CLI prints nothing on stdout, prints something that
            is not valid JSON, or prints JSON without a usable ``results`` list.
    """
    # A temporary directory (instead of a delete=False temp file) guarantees the
    # claims file is removed even when writing it fails part-way through.
    with tempfile.TemporaryDirectory() as workdir:
        claims_path = Path(workdir) / "claims.jsonl"
        with claims_path.open("w", encoding="utf-8") as handle:
            for row in rows:
                handle.write(json.dumps({"id": row["id"], "doi": row["doi"], "claim": row["claim"]}) + "\n")
        proc = subprocess.run(
            [sys.executable, "-m", "ref_verify.cli", "check-file", str(claims_path), "--json"],
            capture_output=True,
            text=True,
        )
    if not proc.stdout.strip():
        raise SystemExit(f"check-file produced no JSON. stderr:\n{proc.stderr}")
    try:
        payload = json.loads(proc.stdout)
        return {r["id"]: r for r in payload["results"]}
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        # Report malformed output the same way as missing output, with both
        # streams attached, rather than surfacing a raw traceback.
        raise SystemExit(
            f"check-file produced unusable JSON (exit code {proc.returncode}): {exc!r}\n"
            f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
        ) from exc
58+
59+
60+
def main() -> int:
    """Run the corpus through the CLI, print the report table, and return the exit code.

    Returns:
        1 if any SAFETY invariant is violated, otherwise 0. Pending PROGRESS
        rows are listed but do not change the exit code.
    """
    corpus = _load_corpus()
    by_id = _run_cli(corpus)

    violations: list[str] = []
    pending: list[str] = []
    rule = "-" * 92
    print(f"{'id':26}{'verdict':20}{'expected':14}{'class':13}note")
    print(rule)
    for entry in corpus:
        entry_id = entry["id"]
        outcome = by_id.get(entry_id, {})
        verdict = outcome.get("verdict", "MISSING")
        status = outcome.get("status", "")
        is_accept = verdict == "ACCEPT"
        label = "PASS"
        remark = ""

        if entry.get("must_accept") and not is_accept:
            label = "SAFETY-FAIL"
            remark = "must ACCEPT but did not"
            violations.append(entry_id)
        elif entry.get("must_not_accept"):
            # Control row: the only invariant is "never ACCEPT". The exact
            # non-ACCEPT verdict (UNVERIFIABLE vs PARTIAL) can vary with source
            # availability, so it is not pinned.
            if is_accept:
                label = "SAFETY-FAIL"
                remark = "must NOT ACCEPT but did"
                violations.append(entry_id)
        elif entry["expected_verdict"] not in (verdict, status):
            # Neither the verdict nor the status matches the label yet.
            issues = ",".join(entry.get("gated_on") or [])
            if not issues:
                issues = "?"
            label = "PENDING"
            remark = f"want {entry['expected_verdict']} after {issues}"
            pending.append(entry_id)

        # WARN verdicts are shown together with their status.
        if verdict != "WARN":
            shown = verdict
        else:
            shown = f"{verdict}/{status}"
        print(f"{entry_id:26}{shown:20}{entry['expected_verdict']:14}{label:13}{remark}")

    print(rule)
    total = len(corpus)
    print(
        f"SAFETY: {total - len(violations)}/{total} ok"
        f" | PROGRESS pending: {len(pending)}"
    )
    if violations:
        print("SAFETY FAILURES (release blockers):", ", ".join(violations))
        return 1
    if pending:
        print("Pending (informational, not a failure):", ", ".join(pending))
    return 0
105+
106+
107+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)