feat(065): security baseline section in baseline_v1.json (MCP-815)

Dumbris · Paperclip-Paperclip · Dumbris · commit 7cbba8d6d693 · 2026-06-02T05:49:17.000+03:00
Fill the reserved `security` block of the Spec 065 D2 regression baseline from real scorer output. Numbers measured end-to-end (no estimates): cmd/scan-eval @ 33bb6e39 over security_corpus_v1.json (43 entries) -> mcp-eval security @ 76df3a47 (corpus-id coverage on). sensitive-data (deterministic, N=1): P=0.667 R=0.100 F1=0.174 FPR=0.043 (tp=2 fp=1 tn=22 fn=18). Per-detector floats copied verbatim (full precision) so a fresh CI run diffs cleanly against the anchor. Baked-in gate (approved at Gate 2, board-accepted 2026-06-02): fpr_ceiling=0.10, recall_floor=0.05. Recall is intentionally low: sensitive-data detects secret/path-leak, not prompt injection; ~18/20 malicious entries are out of scope and will be owned by future detectors. The FPR ceiling is the primary quiet-security gate; recall_floor is a total-blindness anchor. Adds baseline_test.go: shape guard + a self-consistency check that the committed anchor itself passes its own gate (fails CI on a bad refresh). Feeds C1/MCP-742 (eval.yml CI gate). Retrieval section untouched (CN-002). Co-Authored-By: Paperclip <noreply@paperclip.ing>
diff --git a/specs/065-evaluation-foundation/datasets/baseline_test.go b/specs/065-evaluation-foundation/datasets/baseline_test.go
@@ -0,0 +1,118 @@
+// Baseline shape + self-consistency guard for the Spec 065 / D2 security section
+// of baseline_v1.json (MCP-815). The committed security block is the regression
+// anchor that the CI gate (C1/MCP-742, eval.yml) diffs a fresh `mcp-eval security`
+// run against. This test enforces two things JSON Schema cannot:
+//
+//  1. the security block carries the per-detector metrics + gate thresholds the
+//     scorer's SecurityReport shape requires (so the gate has something to read), and
+//  2. the committed numbers themselves PASS their own gate (fpr <= fpr_ceiling AND
+//     recall >= recall_floor) — a guard so a bad future edit that bakes in a
+//     gate-violating anchor fails CI immediately, not silently.
+package datasets
+
+import (
+	"encoding/json"
+	"os"
+	"testing"
+)
+
+// baselineFile is the name of the committed baseline document. It is passed to
+// os.ReadFile unchanged, so it is resolved relative to the directory the test
+// process runs in.
+const baselineFile = "baseline_v1.json"
+
+// secGate mirrors the `gate` object of the security section: the two thresholds
+// the tests in this file compare every per-detector entry against.
+type secGate struct {
+	// FPRCeiling is the highest fpr a detector entry may report.
+	FPRCeiling  float64 `json:"fpr_ceiling"`
+	// RecallFloor is the lowest recall a detector entry may report.
+	RecallFloor float64 `json:"recall_floor"`
+}
+
+// secDetector mirrors one element of `security.per_detector`: the detector name,
+// its reported rates, the confusion-matrix counts, and the run count. JSON keys
+// without a matching field here (for example `fpr_std`) are not decoded.
+type secDetector struct {
+	Detector  string  `json:"detector"`
+	// Reported rates; the shape test requires each to lie in [0,1].
+	Precision float64 `json:"precision"`
+	Recall    float64 `json:"recall"`
+	F1        float64 `json:"f1"`
+	FPR       float64 `json:"fpr"`
+	// Confusion-matrix counts; the shape test requires their sum to be non-zero.
+	TP        int     `json:"tp"`
+	FP        int     `json:"fp"`
+	TN        int     `json:"tn"`
+	FN        int     `json:"fn"`
+	Runs      int     `json:"runs"`
+}
+
+// securitySection mirrors the subset of the top-level `security` object that the
+// tests in this file read. Other keys in that object (such as `__doc__` and
+// `generated_from`) have no field here and are not decoded.
+type securitySection struct {
+	CorpusVersion string        `json:"corpus_version"`
+	RunsAveraged  int           `json:"runs_averaged"`
+	PerDetector   []secDetector `json:"per_detector"`
+	Gate          secGate       `json:"gate"`
+}
+
+// baselineDoc is the decode target for the baseline file. Only the `security`
+// key is mapped; every other top-level key is left undecoded.
+type baselineDoc struct {
+	Security securitySection `json:"security"`
+}
+
+// loadBaseline reads baselineFile and decodes it into a baselineDoc. A read or
+// decode failure aborts the calling test via t.Fatalf, so callers always receive
+// a decoded document.
+func loadBaseline(t *testing.T) baselineDoc {
+	t.Helper()
+
+	var parsed baselineDoc
+
+	data, readErr := os.ReadFile(baselineFile)
+	if readErr != nil {
+		t.Fatalf("read %s: %v", baselineFile, readErr)
+	}
+
+	decodeErr := json.Unmarshal(data, &parsed)
+	if decodeErr != nil {
+		t.Fatalf("decode %s: %v", baselineFile, decodeErr)
+	}
+
+	return parsed
+}
+
+func TestBaseline_SecuritySectionShape(t *testing.T) {
+	doc := loadBaseline(t)
+	sec := doc.Security
+
+	if sec.CorpusVersion == "" {
+		t.Error("security.corpus_version is empty")
+	}
+	if sec.RunsAveraged < 1 {
+		t.Errorf("security.runs_averaged must be >=1, got %d", sec.RunsAveraged)
+	}
+	if len(sec.PerDetector) == 0 {
+		t.Fatal("security.per_detector is empty — the gate has nothing to diff against")
+	}
+	if sec.Gate.FPRCeiling <= 0 || sec.Gate.FPRCeiling > 1 {
+		t.Errorf("security.gate.fpr_ceiling out of (0,1]: %v", sec.Gate.FPRCeiling)
+	}
+	if sec.Gate.RecallFloor < 0 || sec.Gate.RecallFloor > 1 {
+		t.Errorf("security.gate.recall_floor out of [0,1]: %v", sec.Gate.RecallFloor)
+	}
+
+	for _, d := range sec.PerDetector {
+		if d.Detector == "" {
+			t.Error("per_detector entry has empty detector name")
+		}
+		// Confusion-matrix counts must be coherent with the reported rates.
+		if d.TP+d.FP+d.TN+d.FN == 0 {
+			t.Errorf("detector %q: empty confusion matrix", d.Detector)
+		}
+		for name, v := range map[string]float64{
+			"precision": d.Precision, "recall": d.Recall, "f1": d.F1, "fpr": d.FPR,
+		} {
+			if v < 0 || v > 1 {
+				t.Errorf("detector %q: %s out of [0,1]: %v", d.Detector, name, v)
+			}
+		}
+	}
+}
+
+// TestBaseline_SecuritySelfConsistentWithGate asserts the committed anchor itself
+// passes the gate it ships with: every per-detector entry must have
+// fpr <= gate.fpr_ceiling and recall >= gate.recall_floor. If a future refresh
+// bakes in numbers that violate the gate, this fails before the CI scorer ever
+// runs — the anchor must be valid.
+func TestBaseline_SecuritySelfConsistentWithGate(t *testing.T) {
+	doc := loadBaseline(t)
+	sec := doc.Security
+
+	// Without this guard the loop below would pass vacuously on an empty list,
+	// e.g. when this test is selected on its own with -run.
+	if len(sec.PerDetector) == 0 {
+		t.Fatal("security.per_detector is empty — no anchor to check against the gate")
+	}
+
+	for _, d := range sec.PerDetector {
+		if d.FPR > sec.Gate.FPRCeiling {
+			t.Errorf("detector %q: baseline fpr %v exceeds gate.fpr_ceiling %v (anchor must pass its own gate)",
+				d.Detector, d.FPR, sec.Gate.FPRCeiling)
+		}
+		if d.Recall < sec.Gate.RecallFloor {
+			t.Errorf("detector %q: baseline recall %v below gate.recall_floor %v (anchor must pass its own gate)",
+				d.Detector, d.Recall, sec.Gate.RecallFloor)
+		}
+	}
+}
diff --git a/specs/065-evaluation-foundation/datasets/baseline_v1.json b/specs/065-evaluation-foundation/datasets/baseline_v1.json
@@ -22,5 +22,36 @@
     "recall_at_5": 0.05
   },
   "runs_averaged": 1,
-  "security": {}
+  "security": {
+    "__doc__": "Spec 065 D2 regression baseline for the deterministic sensitive-data detector. The CI gate (C1/MCP-742, eval.yml) runs `mcp-eval security --verdicts <scan-eval out> --corpus security_corpus_v1.json` and FAILS if a fresh run's FPR > gate.fpr_ceiling OR recall < gate.recall_floor. recall is intentionally low: sensitive-data detects secret/path-leak, NOT prompt injection; ~18/20 malicious corpus entries (prompt_injection/shadowing/rug_pull) are structurally out of scope and will be owned by future detectors. The FPR ceiling is the primary quiet-security gate; recall_floor is a total-blindness anchor. Per-detector floats are copied verbatim (full precision) from the scorer report for exact diffing. Refresh requires re-freezing the corpus + re-review (CN-002/CN-004).",
+    "corpus_version": "v1",
+    "generated_from": {
+      "scorer": "mcp-eval security @ 76df3a47",
+      "scan_eval": "cmd/scan-eval @ 33bb6e39",
+      "corpus": "datasets/security_corpus_v1.json (43 entries)",
+      "detector": "internal/security sensitive-data (deterministic, N=1)",
+      "runs": 1,
+      "note": "Reference = current deterministic detector behavior, a regression anchor (NOT a quality target). Gate thresholds approved at Gate 2 (MCP-815 plan revision, board-accepted 2026-06-02)."
+    },
+    "runs_averaged": 1,
+    "per_detector": [
+      {
+        "detector": "sensitive-data",
+        "precision": 0.6666666666666666,
+        "recall": 0.1,
+        "f1": 0.1739130434782609,
+        "fpr": 0.043478260869565216,
+        "fpr_std": 0.0,
+        "tp": 2,
+        "fp": 1,
+        "tn": 22,
+        "fn": 18,
+        "runs": 1
+      }
+    ],
+    "gate": {
+      "fpr_ceiling": 0.1,
+      "recall_floor": 0.05
+    }
+  }
 }