chore: harden consistency — risk-code binding, deterministic health score, validation gates

hyhmrright · claude · hyhmrright · commit 044f591436f5 · 2026-06-18T20:55:56.000+08:00
Content:
- decay-risks: bind R1–R6 codes into the canonical risk headers (previously bare
  "Risk N") so a code is verifiable without counting headers
- common.md: add a Code column to the decay-risk navigation index
- health-guide: make the composite score reproducible — floor each dimension
  before weighting, never skip a no-finding dimension, round half-up

Tooling:
- bump-version: propagate the version badge to README.zh-CN.md too
- validate-repo: assert the zh-CN README badge matches package.json
- run-evals: add duplicate-id, files-array, mode↔risk-compatibility, and
  reverse-coverage (every code has a positive scenario) structural checks

All gates green: npm run validate, npm test (55), npm run evals (49).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/scripts/bump-version.mjs b/scripts/bump-version.mjs
@@ -28,9 +28,11 @@ for (const { rel, update } of manifests) {
   console.log(`  ✓ ${rel}`);
 }
 
-let readme = readFileSync(path.join(root, "README.md"), "utf8");
-readme = readme.replace(/version-[\d.]+?-blue\.svg/g, `version-${version}-blue.svg`);
-writeFileSync(path.join(root, "README.md"), readme, "utf8");
-console.log("  ✓ README.md badge");
+for (const readmeRel of ["README.md", "README.zh-CN.md"]) {
+  let readme = readFileSync(path.join(root, readmeRel), "utf8");
+  readme = readme.replace(/version-[\d.]+?-blue\.svg/g, `version-${version}-blue.svg`);
+  writeFileSync(path.join(root, readmeRel), readme, "utf8");
+  console.log(`  ✓ ${readmeRel} badge`);
+}
 
 console.log(`\nAll manifests updated to ${version}. Run npm run validate to confirm.`);
diff --git a/scripts/run-evals.mjs b/scripts/run-evals.mjs
@@ -48,6 +48,14 @@ for (let i = 0; i < evals.length; i++) {
   }
 }
 
+// Explicit duplicate-id guard (the sequential check only catches dups that also
+// break the running count; a deliberate re-use of the same id would not).
+const idCounts = new Map();
+for (const ev of evals) idCounts.set(ev.id, (idCounts.get(ev.id) ?? 0) + 1);
+for (const [id, count] of idCounts) {
+  if (count > 1) errors.push(`Duplicate eval id ${JSON.stringify(id)} appears ${count} times`);
+}
+
 // ── Per-eval field and content checks ─────────────────────────────────────
 
 for (const ev of evals) {
@@ -71,13 +79,31 @@ for (const ev of evals) {
     errors.push(`${label}: 'mode' must be one of ${VALID_MODES.join(", ")} (got '${ev.mode}')`);
   }
 
+  if ("files" in ev && !Array.isArray(ev.files)) {
+    errors.push(`${label}: 'files' must be an array when present (got ${typeof ev.files})`);
+  }
+
   // expected_output should reference at least one risk code so reviewers know
   // which risk the scenario is testing
   if (typeof ev.expected_output === "string") {
     const referencedCodes = RISK_CODES.filter((code) => ev.expected_output.includes(code));
     if (referencedCodes.length === 0) {
       warnings.push(`${label}: expected_output does not reference any risk code (${RISK_CODES.join(", ")})`);
     }
+
+    // mode ↔ risk-code compatibility: assemble-prompt.mjs only loads the risk
+    // definitions for that mode (test→T-codes, review/audit/debt→R-codes,
+    // health/sweep→both). A code outside the loaded set is a dead reference —
+    // the model is never given its definition, so the scenario cannot pass live.
+    // RISK_CODES is R/T-prefixed by construction, so c[0] fully partitions it.
+    const refsR = referencedCodes.filter((c) => c[0] === "R");
+    const refsT = referencedCodes.filter((c) => c[0] === "T");
+    if (ev.mode === "test" && refsR.length > 0) {
+      errors.push(`${label}: mode 'test' loads only T-codes but expected_output references ${refsR.join(", ")}`);
+    }
+    if (["review", "audit", "debt"].includes(ev.mode) && refsT.length > 0) {
+      errors.push(`${label}: mode '${ev.mode}' loads only R-codes but expected_output references ${refsT.join(", ")}`);
+    }
   }
 
   // no_risk_codes and no_health_score are optional flags that put the live
@@ -95,18 +121,40 @@ for (const ev of evals) {
   }
 }
 
+// ── Reverse coverage ───────────────────────────────────────────────────────
+// Every risk code must have at least one positive happy-path scenario. Skip the
+// false-positive (no_risk_codes) and health-score-suppression (no_health_score)
+// boundary scenarios — neither is a clean positive demonstration of a code.
+// CLAUDE.md requires "every new risk code gets paired coverage"; this enforces it
+// so a new code can never ship without a happy-path eval.
+
+const coveredCodes = new Set();
+for (const ev of evals) {
+  if (ev.no_risk_codes || ev.no_health_score) continue;
+  if (typeof ev.expected_output !== "string") continue;
+  for (const code of RISK_CODES) {
+    if (ev.expected_output.includes(code)) coveredCodes.add(code);
+  }
+}
+const uncoveredCodes = RISK_CODES.filter((code) => !coveredCodes.has(code));
+if (uncoveredCodes.length > 0) {
+  errors.push(`Risk codes with no positive eval scenario: ${uncoveredCodes.join(", ")}`);
+}
+
 // ── Report ─────────────────────────────────────────────────────────────────
 
-const idCheckPass = !errors.some((e) => e.includes("expected id"));
-const fieldCheckPass = !errors.some((e) => e.includes("missing required field") || e.includes("is empty"));
+const idCheckPass = !errors.some((e) => e.includes("expected id") || e.includes("Duplicate eval id"));
+const fieldCheckPass = !errors.some((e) => e.includes("missing required field") || e.includes("is empty") || e.includes("'files' must"));
+const coherencePass = !errors.some((e) => e.includes("loads only") || e.includes("no positive eval scenario"));
 const riskCodePass = warnings.length === 0;
 
 console.log("\nEval Suite Structural Validation");
 console.log("=================================");
-console.log(`Total scenarios : ${evals.length}`);
-console.log(`Sequential IDs  : ${idCheckPass ? "PASS" : "FAIL"}`);
-console.log(`Required fields : ${fieldCheckPass ? "PASS" : "FAIL"}`);
-console.log(`Risk code refs  : ${riskCodePass ? "PASS" : `${warnings.length} warning(s)`}`);
+console.log(`Total scenarios   : ${evals.length}`);
+console.log(`Sequential IDs    : ${idCheckPass ? "PASS" : "FAIL"}`);
+console.log(`Required fields   : ${fieldCheckPass ? "PASS" : "FAIL"}`);
+console.log(`Mode/risk & cover : ${coherencePass ? "PASS" : "FAIL"}`);
+console.log(`Risk code refs    : ${riskCodePass ? "PASS" : `${warnings.length} warning(s)`}`);
 
 if (errors.length > 0) {
   console.error("\nErrors:");
diff --git a/scripts/validate-repo.mjs b/scripts/validate-repo.mjs
@@ -99,6 +99,8 @@ const CANONICAL_INSTALL_CMD = "/plugin marketplace add hyhmrright/brooks-lint";
 function checkReadmeIntegrity() {
   const readme = readText("README.md");
   check(readme.includes(`version-${version}-blue.svg`), `README.md badge does not reference version ${version}`);
+  const readmeZh = readText("README.zh-CN.md");
+  check(readmeZh.includes(`version-${version}-blue.svg`), `README.zh-CN.md badge does not reference version ${version} (run npm run bump)`);
   check(readme.includes(CANONICAL_INSTALL_CMD), `README.md should contain canonical install command`);
   check(
     readme.includes(`grounded in ${sourceWord} classic engineering books`),
diff --git a/skills/_shared/common.md b/skills/_shared/common.md
@@ -101,14 +101,14 @@ to Flag" guards) live in `decay-risks.md`. Do not duplicate or edit diagnostic q
 update `decay-risks.md` directly. Book-level coverage, exceptions, and tradeoffs are in
 `source-coverage.md`.
 
-| Risk | Diagnostic Question |
-|------|---------------------|
-| Cognitive Overload | How much mental effort to understand this? |
-| Change Propagation | How many unrelated things break on one change? |
-| Knowledge Duplication | Is the same decision expressed in multiple places? |
-| Accidental Complexity | Is the code more complex than the problem? |
-| Dependency Disorder | Do dependencies flow in a consistent direction? |
-| Domain Model Distortion | Does the code faithfully represent the domain? |
+| Code | Risk | Diagnostic Question |
+|------|------|---------------------|
+| R1 | Cognitive Overload | How much mental effort to understand this? |
+| R2 | Change Propagation | How many unrelated things break on one change? |
+| R3 | Knowledge Duplication | Is the same decision expressed in multiple places? |
+| R4 | Accidental Complexity | Is the code more complex than the problem? |
+| R5 | Dependency Disorder | Do dependencies flow in a consistent direction? |
+| R6 | Domain Model Distortion | Does the code faithfully represent the domain? |
 
 ---
 
diff --git a/skills/_shared/decay-risks.md b/skills/_shared/decay-risks.md
@@ -4,7 +4,7 @@ Six patterns that cause software to degrade. Apply the Iron Law to each finding.
 
 ---
 
-## Risk 1: Cognitive Overload
+## Risk 1: Cognitive Overload (R1)
 
 **Diagnostic question:** How much mental effort does a human need to understand this?
 
@@ -57,7 +57,7 @@ Cognitive load beyond working memory causes mistakes, avoidance, and blocks the
 
 ---
 
-## Risk 2: Change Propagation
+## Risk 2: Change Propagation (R2)
 
 **Diagnostic question:** How many unrelated things break when you change one thing?
 
@@ -110,7 +110,7 @@ Each change ripples to unrelated modules, slowing velocity and multiplying regre
 
 ---
 
-## Risk 3: Knowledge Duplication
+## Risk 3: Knowledge Duplication (R3)
 
 **Diagnostic question:** Is the same decision expressed in more than one place?
 
@@ -150,7 +150,7 @@ Multiple copies drift apart silently. DRY is about decisions, not code lines.
 
 ---
 
-## Risk 4: Accidental Complexity
+## Risk 4: Accidental Complexity (R4)
 
 **Diagnostic question:** Is the code more complex than the problem it solves?
 
@@ -197,7 +197,7 @@ Accidental complexity accumulates addition by addition until developers fight sc
 
 ---
 
-## Risk 5: Dependency Disorder
+## Risk 5: Dependency Disorder (R5)
 
 **Diagnostic question:** Do dependencies flow in a consistent, predictable direction?
 
@@ -248,7 +248,7 @@ When business logic depends on infrastructure, infrastructure changes cascade in
 
 ---
 
-## Risk 6: Domain Model Distortion
+## Risk 6: Domain Model Distortion (R6)
 
 **Diagnostic question:** Does the code faithfully represent the problem it is solving?
 
diff --git a/skills/brooks-health/health-guide.md b/skills/brooks-health/health-guide.md
@@ -55,6 +55,16 @@ across the remaining three dimensions by dividing each remaining weight by
 | Debt | 0.25 | 0.25 / 0.75 = 0.33 |
 | Test | 0.20 | 0.20 / 0.75 = 0.27 |
 
+**Score rules (must be deterministic — two runs on the same codebase must agree):**
+
+- Each dimension's score is computed from the **capped** finding set shown in the dashboard
+  (the cap at Step 1 bounds both what is displayed and what is deducted — do not deduct for
+  findings beyond the cap).
+- Floor each dimension score at 0 **before** weighting.
+- A dimension with no findings scores **100** — it is never skipped. The **only** dimension
+  ever omitted is PR, and only when no diff exists (its weight is then redistributed above).
+- Round the weighted composite to the nearest integer (half-up).
+
 ### Step 3: Output Dashboard
 
 Use the dashboard report template below instead of the standard common.md template.