Skip to content

Commit 7cbba8d

Browse files
feat(065): security baseline section in baseline_v1.json (MCP-815)
Fill the reserved `security` block of the Spec 065 D2 regression baseline from real scorer output. Numbers measured end-to-end (no estimates): cmd/scan-eval @ 33bb6e3 over security_corpus_v1.json (43 entries) -> mcp-eval security @ 76df3a47 (corpus-id coverage on) sensitive-data (deterministic, N=1): P=0.667 R=0.100 F1=0.174 FPR=0.043 (tp=2 fp=1 tn=22 fn=18). Per-detector floats copied verbatim (full precision) so a fresh CI run diffs cleanly against the anchor. Baked-in gate (approved at Gate 2, board-accepted 2026-06-02): fpr_ceiling=0.10 recall_floor=0.05 recall is intentionally low: sensitive-data detects secret/path-leak, not prompt injection; ~18/20 malicious entries are out of scope and will be owned by future detectors. The FPR ceiling is the primary quiet-security gate; recall_floor is a total-blindness anchor. Adds baseline_test.go: shape guard + a self-consistency check that the committed anchor itself passes its own gate (fails CI on a bad refresh). Feeds C1/MCP-742 (eval.yml CI gate). Retrieval section untouched (CN-002). Co-Authored-By: Paperclip <noreply@paperclip.ing>
1 parent 33bb6e3 commit 7cbba8d

2 files changed

Lines changed: 150 additions & 1 deletion

File tree

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// Baseline shape + self-consistency guard for the Spec 065 / D2 security section
2+
// of baseline_v1.json (MCP-815). The committed security block is the regression
3+
// anchor that the CI gate (C1/MCP-742, eval.yml) diffs a fresh `mcp-eval security`
4+
// run against. This test enforces two things JSON Schema cannot:
5+
//
6+
// 1. the security block carries the per-detector metrics + gate thresholds the
7+
// scorer's SecurityReport shape requires (so the gate has something to read), and
8+
// 2. the committed numbers themselves PASS their own gate (fpr <= fpr_ceiling AND
9+
// recall >= recall_floor) — a guard so a bad future edit that bakes in a
10+
// gate-violating anchor fails CI immediately, not silently.
11+
package datasets
12+
13+
import (
14+
"encoding/json"
15+
"os"
16+
"testing"
17+
)
18+
19+
// baselineFile is the path of the baseline JSON document that loadBaseline
// reads. It is a relative path, so it resolves against the working directory
// of the test process.
const baselineFile = "baseline_v1.json"
20+
21+
// secGate mirrors the security.gate object of the baseline document: the two
// thresholds every per-detector entry is compared against.
type secGate struct {
	// FPRCeiling is the upper bound a detector's fpr must not exceed.
	FPRCeiling float64 `json:"fpr_ceiling"`
	// RecallFloor is the lower bound a detector's recall must not fall below.
	RecallFloor float64 `json:"recall_floor"`
}
25+
26+
// secDetector mirrors one element of security.per_detector: the reported
// rates of a single detector together with the confusion-matrix counts.
type secDetector struct {
	// Detector is the detector's name.
	Detector string `json:"detector"`

	// Reported rates.
	Precision float64 `json:"precision"`
	Recall    float64 `json:"recall"`
	F1        float64 `json:"f1"`
	FPR       float64 `json:"fpr"`

	// Confusion-matrix counts.
	TP int `json:"tp"`
	FP int `json:"fp"`
	TN int `json:"tn"`
	FN int `json:"fn"`

	// Runs is decoded from the entry's "runs" field.
	Runs int `json:"runs"`
}
38+
39+
type securitySection struct {
40+
CorpusVersion string `json:"corpus_version"`
41+
RunsAveraged int `json:"runs_averaged"`
42+
PerDetector []secDetector `json:"per_detector"`
43+
Gate secGate `json:"gate"`
44+
}
45+
46+
type baselineDoc struct {
47+
Security securitySection `json:"security"`
48+
}
49+
50+
func loadBaseline(t *testing.T) baselineDoc {
51+
t.Helper()
52+
raw, err := os.ReadFile(baselineFile)
53+
if err != nil {
54+
t.Fatalf("read %s: %v", baselineFile, err)
55+
}
56+
var doc baselineDoc
57+
if err := json.Unmarshal(raw, &doc); err != nil {
58+
t.Fatalf("decode %s: %v", baselineFile, err)
59+
}
60+
return doc
61+
}
62+
63+
func TestBaseline_SecuritySectionShape(t *testing.T) {
64+
doc := loadBaseline(t)
65+
sec := doc.Security
66+
67+
if sec.CorpusVersion == "" {
68+
t.Error("security.corpus_version is empty")
69+
}
70+
if sec.RunsAveraged < 1 {
71+
t.Errorf("security.runs_averaged must be >=1, got %d", sec.RunsAveraged)
72+
}
73+
if len(sec.PerDetector) == 0 {
74+
t.Fatal("security.per_detector is empty — the gate has nothing to diff against")
75+
}
76+
if sec.Gate.FPRCeiling <= 0 || sec.Gate.FPRCeiling > 1 {
77+
t.Errorf("security.gate.fpr_ceiling out of (0,1]: %v", sec.Gate.FPRCeiling)
78+
}
79+
if sec.Gate.RecallFloor < 0 || sec.Gate.RecallFloor > 1 {
80+
t.Errorf("security.gate.recall_floor out of [0,1]: %v", sec.Gate.RecallFloor)
81+
}
82+
83+
for _, d := range sec.PerDetector {
84+
if d.Detector == "" {
85+
t.Error("per_detector entry has empty detector name")
86+
}
87+
// Confusion-matrix counts must be coherent with the reported rates.
88+
if d.TP+d.FP+d.TN+d.FN == 0 {
89+
t.Errorf("detector %q: empty confusion matrix", d.Detector)
90+
}
91+
for name, v := range map[string]float64{
92+
"precision": d.Precision, "recall": d.Recall, "f1": d.F1, "fpr": d.FPR,
93+
} {
94+
if v < 0 || v > 1 {
95+
t.Errorf("detector %q: %s out of [0,1]: %v", d.Detector, name, v)
96+
}
97+
}
98+
}
99+
}
100+
101+
// TestBaseline_SecuritySelfConsistentWithGate asserts the committed anchor itself
102+
// passes the gate it ships with. If a future refresh bakes in numbers that violate
103+
// the gate, this fails before the CI scorer ever runs — the anchor must be valid.
104+
func TestBaseline_SecuritySelfConsistentWithGate(t *testing.T) {
105+
doc := loadBaseline(t)
106+
sec := doc.Security
107+
108+
for _, d := range sec.PerDetector {
109+
if d.FPR > sec.Gate.FPRCeiling {
110+
t.Errorf("detector %q: baseline fpr %v exceeds gate.fpr_ceiling %v (anchor must pass its own gate)",
111+
d.Detector, d.FPR, sec.Gate.FPRCeiling)
112+
}
113+
if d.Recall < sec.Gate.RecallFloor {
114+
t.Errorf("detector %q: baseline recall %v below gate.recall_floor %v (anchor must pass its own gate)",
115+
d.Detector, d.Recall, sec.Gate.RecallFloor)
116+
}
117+
}
118+
}

specs/065-evaluation-foundation/datasets/baseline_v1.json

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,36 @@
2222
"recall_at_5": 0.05
2323
},
2424
"runs_averaged": 1,
25-
"security": {}
25+
"security": {
26+
"__doc__": "Spec 065 D2 regression baseline for the deterministic sensitive-data detector. The CI gate (C1/MCP-742, eval.yml) runs `mcp-eval security --verdicts <scan-eval out> --corpus security_corpus_v1.json` and FAILS if a fresh run's FPR > gate.fpr_ceiling OR recall < gate.recall_floor. recall is intentionally low: sensitive-data detects secret/path-leak, NOT prompt injection; ~18/20 malicious corpus entries (prompt_injection/shadowing/rug_pull) are structurally out of scope and will be owned by future detectors. The FPR ceiling is the primary quiet-security gate; recall_floor is a total-blindness anchor. Per-detector floats are copied verbatim (full precision) from the scorer report for exact diffing. Refresh requires re-freezing the corpus + re-review (CN-002/CN-004).",
27+
"corpus_version": "v1",
28+
"generated_from": {
29+
"scorer": "mcp-eval security @ 76df3a47",
30+
"scan_eval": "cmd/scan-eval @ 33bb6e39",
31+
"corpus": "datasets/security_corpus_v1.json (43 entries)",
32+
"detector": "internal/security sensitive-data (deterministic, N=1)",
33+
"runs": 1,
34+
"note": "Reference = current deterministic detector behavior, a regression anchor (NOT a quality target). Gate thresholds approved at Gate 2 (MCP-815 plan revision, board-accepted 2026-06-02)."
35+
},
36+
"runs_averaged": 1,
37+
"per_detector": [
38+
{
39+
"detector": "sensitive-data",
40+
"precision": 0.6666666666666666,
41+
"recall": 0.1,
42+
"f1": 0.1739130434782609,
43+
"fpr": 0.043478260869565216,
44+
"fpr_std": 0.0,
45+
"tp": 2,
46+
"fp": 1,
47+
"tn": 22,
48+
"fn": 18,
49+
"runs": 1
50+
}
51+
],
52+
"gate": {
53+
"fpr_ceiling": 0.1,
54+
"recall_floor": 0.05
55+
}
56+
}
2657
}

0 commit comments

Comments
 (0)