@@ -48,6 +48,14 @@ for (let i = 0; i < evals.length; i++) {
4848 }
4949}
5050
// Duplicate-id guard. Tally how often each eval id occurs and report every id
// that is used more than once. This runs on its own, independently of the
// sequential-id check, so a re-used id is always reported by value together
// with the number of times it appears.
const idCounts = new Map();
evals.forEach(({ id }) => {
  const seenSoFar = idCounts.get(id) ?? 0;
  idCounts.set(id, seenSoFar + 1);
});
idCounts.forEach((occurrences, id) => {
  // Ids that occur exactly once are fine; only repeats are errors.
  if (occurrences <= 1) return;
  errors.push(`Duplicate eval id ${JSON.stringify(id)} appears ${occurrences} times`);
});
58+
5159// ── Per-eval field and content checks ─────────────────────────────────────
5260
5361for ( const ev of evals ) {
@@ -71,13 +79,31 @@ for (const ev of evals) {
7179 errors . push ( `${ label } : 'mode' must be one of ${ VALID_MODES . join ( ", " ) } (got '${ ev . mode } ')` ) ;
7280 }
7381
82+ if ( "files" in ev && ! Array . isArray ( ev . files ) ) {
83+ errors . push ( `${ label } : 'files' must be an array when present (got ${ typeof ev . files } )` ) ;
84+ }
85+
7486 // expected_output should reference at least one risk code so reviewers know
7587 // which risk the scenario is testing
7688 if ( typeof ev . expected_output === "string" ) {
7789 const referencedCodes = RISK_CODES . filter ( ( code ) => ev . expected_output . includes ( code ) ) ;
7890 if ( referencedCodes . length === 0 ) {
7991 warnings . push ( `${ label } : expected_output does not reference any risk code (${ RISK_CODES . join ( ", " ) } )` ) ;
8092 }
93+
94+ // mode ↔ risk-code compatibility: assemble-prompt.mjs only loads the risk
95+ // definitions for that mode (test→T-codes, review/audit/debt→R-codes,
96+ // health/sweep→both). A code outside the loaded set is a dead reference —
97+ // the model is never given its definition, so the scenario cannot pass live.
98+ // RISK_CODES is R/T-prefixed by construction, so c[0] fully partitions it.
99+ const refsR = referencedCodes . filter ( ( c ) => c [ 0 ] === "R" ) ;
100+ const refsT = referencedCodes . filter ( ( c ) => c [ 0 ] === "T" ) ;
101+ if ( ev . mode === "test" && refsR . length > 0 ) {
102+ errors . push ( `${ label } : mode 'test' loads only T-codes but expected_output references ${ refsR . join ( ", " ) } ` ) ;
103+ }
104+ if ( [ "review" , "audit" , "debt" ] . includes ( ev . mode ) && refsT . length > 0 ) {
105+ errors . push ( `${ label } : mode '${ ev . mode } ' loads only R-codes but expected_output references ${ refsT . join ( ", " ) } ` ) ;
106+ }
81107 }
82108
83109 // no_risk_codes and no_health_score are optional flags that put the live
@@ -95,18 +121,40 @@ for (const ev of evals) {
95121 }
96122}
97123
// ── Reverse coverage ───────────────────────────────────────────────────────
// Each entry of RISK_CODES needs at least one positive, happy-path scenario
// whose expected_output mentions it. Scenarios flagged no_risk_codes
// (false-positive boundary) or no_health_score (health-score suppression) are
// excluded because neither is a clean positive demonstration of a code.
// CLAUDE.md requires "every new risk code gets paired coverage"; failing here
// keeps a new code from shipping without such an eval.

const coveredCodes = new Set(
  evals
    // Drop the boundary scenarios first ...
    .filter((ev) => !ev.no_risk_codes && !ev.no_health_score)
    // ... then anything whose expected_output cannot be searched as text.
    .filter((ev) => typeof ev.expected_output === "string")
    // Collect every risk code that appears (as a substring) in the output.
    .flatMap((ev) => RISK_CODES.filter((code) => ev.expected_output.includes(code))),
);
const uncoveredCodes = RISK_CODES.filter((code) => !coveredCodes.has(code));
if (uncoveredCodes.length !== 0) {
  errors.push(`Risk codes with no positive eval scenario: ${uncoveredCodes.join(", ")}`);
}
143+
98144// ── Report ─────────────────────────────────────────────────────────────────
99145
100- const idCheckPass = ! errors . some ( ( e ) => e . includes ( "expected id" ) ) ;
101- const fieldCheckPass = ! errors . some ( ( e ) => e . includes ( "missing required field" ) || e . includes ( "is empty" ) ) ;
146+ const idCheckPass = ! errors . some ( ( e ) => e . includes ( "expected id" ) || e . includes ( "Duplicate eval id" ) ) ;
147+ const fieldCheckPass = ! errors . some ( ( e ) => e . includes ( "missing required field" ) || e . includes ( "is empty" ) || e . includes ( "'files' must" ) ) ;
148+ const coherencePass = ! errors . some ( ( e ) => e . includes ( "loads only" ) || e . includes ( "no positive eval scenario" ) ) ;
102149const riskCodePass = warnings . length === 0 ;
103150
104151console . log ( "\nEval Suite Structural Validation" ) ;
105152console . log ( "=================================" ) ;
106- console . log ( `Total scenarios : ${ evals . length } ` ) ;
107- console . log ( `Sequential IDs : ${ idCheckPass ? "PASS" : "FAIL" } ` ) ;
108- console . log ( `Required fields : ${ fieldCheckPass ? "PASS" : "FAIL" } ` ) ;
109- console . log ( `Risk code refs : ${ riskCodePass ? "PASS" : `${ warnings . length } warning(s)` } ` ) ;
153+ console . log ( `Total scenarios : ${ evals . length } ` ) ;
154+ console . log ( `Sequential IDs : ${ idCheckPass ? "PASS" : "FAIL" } ` ) ;
155+ console . log ( `Required fields : ${ fieldCheckPass ? "PASS" : "FAIL" } ` ) ;
156+ console . log ( `Mode/risk & cover : ${ coherencePass ? "PASS" : "FAIL" } ` ) ;
157+ console . log ( `Risk code refs : ${ riskCodePass ? "PASS" : `${ warnings . length } warning(s)` } ` ) ;
110158
111159if ( errors . length > 0 ) {
112160 console . error ( "\nErrors:" ) ;
0 commit comments