Skip to content

Commit e917189

Browse files
committed
fix(pii): label detections by their source (ner vs pattern)
Pattern-matcher hits were stored and masked with an "ner:" prefix even though no NER (Named Entity Recognition) model was involved, because the redactor hard-coded the prefix for every detector. Thread a Source through NERConfig (SourceNER / SourcePattern; empty defaults to ner for back-compat) and build the synthetic id from it via NERConfig.patternID. Pattern detections now carry pattern:<GROUP> ids and [REDACTED:pattern:<GROUP>] masks; NER detections stay ner:<GROUP>. The resolver tags each detector with its source, and the doc strings / swagger / api-instructions examples are updated to match. Assisted-by: claude-code:claude-opus-4-8 [Claude Code]
1 parent 3435c90 commit e917189

8 files changed

Lines changed: 63 additions & 16 deletions

File tree

core/application/application.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@ func (a *Application) PIINERResolver() pii.NERDetectorResolver {
285285
0, // patterns are deterministic — no confidence floor
286286
cfg.PIIDetectionDefaultAction(),
287287
patternEntityActions(cfg),
288+
pii.SourcePattern,
288289
), true
289290
}
290291

@@ -294,6 +295,7 @@ func (a *Application) PIINERResolver() pii.NERDetectorResolver {
294295
cfg.PIIDetectionMinScore(),
295296
cfg.PIIDetectionDefaultAction(),
296297
cfg.PIIDetectionEntityActions(),
298+
pii.SourceNER,
297299
), true
298300
}
299301
}

core/http/endpoints/localai/api_instructions.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ var instructionDefs = []instructionDef{
102102
Name: "pii-filtering",
103103
Description: "Inspect the NER-based PII filter applied to chat requests",
104104
Tags: []string{"pii"},
105-
Intro: "PII redaction is NER-based and request-side. A consuming model opts in with `pii: { enabled: true, detectors: [<model>] }` where each detector is a token-classification (token_classify) model. The detection policy lives on the detector model itself in a `pii_detection:` block: `{ min_score, default_action (mask|block|allow), entity_actions: { GROUP: action } }`. Multiple detectors union their hits; overlapping spans resolve to the strongest action (block > mask > allow). PII defaults OFF for non-proxy backends and ON for proxy-* (cloud passthroughs). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (events carry `ner:<GROUP>` ids and an 8-char hash prefix, never the matched value; admin or local-user only). The legacy regex pattern tier and its endpoints (/api/pii/patterns, /test, /decide) were removed.",
105+
Intro: "PII redaction is NER-based and request-side. A consuming model opts in with `pii: { enabled: true, detectors: [<model>] }` where each detector is a token-classification (token_classify) model. The detection policy lives on the detector model itself in a `pii_detection:` block: `{ min_score, default_action (mask|block|allow), entity_actions: { GROUP: action } }`. Multiple detectors union their hits; overlapping spans resolve to the strongest action (block > mask > allow). PII defaults OFF for non-proxy backends and ON for proxy-* (cloud passthroughs). GET /api/pii/events returns recent redaction events filtered by correlation_id / user_id / pattern_id (events carry `<source>:<GROUP>` ids — e.g. `ner:EMAIL` for the neural detector, `pattern:ANTHROPIC_KEY` for the regex pattern tier — and an 8-char hash prefix, never the matched value; admin or local-user only). The legacy regex pattern tier and its endpoints (/api/pii/patterns, /test, /decide) were removed.",
106106
},
107107
{
108108
Name: "middleware-admin",

core/http/routes/pii.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ func RegisterPIIRoutes(e *echo.Echo, app *application.Application) {
3636
// @Produce json
3737
// @Param correlation_id query string false "Correlation ID join key"
3838
// @Param user_id query string false "User id"
39-
// @Param pattern_id query string false "Detector group id (e.g. ner:EMAIL)"
39+
// @Param pattern_id query string false "Detector group id (e.g. ner:EMAIL, pattern:ANTHROPIC_KEY)"
4040
// @Param kind query string false "Event kind: pii | proxy_connect | proxy_traffic"
4141
// @Param limit query int false "Max events" default(100)
4242
// @Success 200 {object} map[string]interface{}

core/services/routing/pii/ner.go

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,22 @@ type NERConfig struct {
6060
// entities silently" — useful when the model returns a broad
6161
// taxonomy but the admin only cares about a subset.
6262
DefaultAction Action
63+
64+
// Source labels where this detector's hits come from. It becomes the
65+
// PatternID prefix on events and the [REDACTED:<id>] mask, so neural NER
66+
// detections (Source "ner") and deterministic pattern-matcher detections
67+
// (Source "pattern") can be told apart in the events log and by the model.
68+
// Empty defaults to "ner" for backward compatibility.
69+
Source string
6370
}
6471

72+
// Detector source labels (the PatternID prefix). Kept short and stable —
73+
// they appear in the events log and the [REDACTED:...] mask.
74+
const (
75+
SourceNER = "ner"
76+
SourcePattern = "pattern"
77+
)
78+
6579
// ResolveAction returns the action configured for a detected entity
6680
// group, falling back to DefaultAction. Returns ("", false) when the
6781
// entity should be ignored entirely (no override + no default).
@@ -82,23 +96,32 @@ func (c NERConfig) ResolveAction(group string) (Action, bool) {
8296
// downgrades it). Unknown per-entity actions are dropped (and logged by
8397
// validActions). This is the single conversion point the application-layer
8498
// resolver uses, so the detector model's policy reaches the redactor in
85-
// exactly one shape.
86-
func NERConfigFromRaw(detector NERDetector, minScore float32, defaultAction string, entityActions map[string]string) NERConfig {
99+
// exactly one shape. source labels the detector kind (SourceNER /
100+
// SourcePattern) and becomes the PatternID prefix; empty defaults to
101+
// SourceNER.
102+
func NERConfigFromRaw(detector NERDetector, minScore float32, defaultAction string, entityActions map[string]string, source string) NERConfig {
103+
if source == "" {
104+
source = SourceNER
105+
}
87106
return NERConfig{
88107
Detector: detector,
89108
MinScore: minScore,
90109
DefaultAction: validActionOr(defaultAction, ActionMask),
91110
EntityActions: validActions(entityActions),
111+
Source: source,
92112
}
93113
}
94114

95-
// nerPatternID returns the synthetic pattern ID that audit rows carry
96-
// for NER hits. Prefixing with "ner:" keeps these distinguishable from
97-
// regex pattern IDs in the events tab and in filter queries; admins
98-
// can switch off a single entity type with the same Disabled-pattern
99-
// machinery used for regex.
100-
func nerPatternID(group string) string {
101-
return "ner:" + group
115+
// patternID returns the synthetic pattern ID that audit rows and masks carry
116+
// for this detector's hits, e.g. "ner:EMAIL" or "pattern:ANTHROPIC_KEY". The
117+
// source prefix keeps neural and deterministic detections distinguishable in
118+
// the events tab and in pattern_id filter queries.
119+
func (c NERConfig) patternID(group string) string {
120+
source := c.Source
121+
if source == "" {
122+
source = SourceNER
123+
}
124+
return source + ":" + group
102125
}
103126

104127
// errNERDetector is a NERDetector that always returns the wrapped

core/services/routing/pii/ner_test.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,23 +131,31 @@ var _ = Describe("RedactNER", func() {
131131
var _ = Describe("NERConfigFromRaw", func() {
132132
det := &stubNERDetector{}
133133

134-
It("defaults an empty default_action to mask", func() {
135-
cfg := NERConfigFromRaw(det, 0.4, "", nil)
134+
It("defaults an empty default_action to mask and an empty source to ner", func() {
135+
cfg := NERConfigFromRaw(det, 0.4, "", nil, "")
136136
Expect(cfg.DefaultAction).To(Equal(ActionMask))
137137
Expect(cfg.MinScore).To(BeNumerically("~", 0.4, 1e-6))
138+
Expect(cfg.Source).To(Equal(SourceNER))
139+
Expect(cfg.patternID("EMAIL")).To(Equal("ner:EMAIL"))
138140
})
139141

140142
It("passes through valid actions and drops invalid ones", func() {
141143
cfg := NERConfigFromRaw(det, 0, "block", map[string]string{
142144
"PASSWORD": "block",
143145
"EMAIL": "mask",
144146
"BOGUS": "nonsense", // dropped
145-
})
147+
}, SourceNER)
146148
Expect(cfg.DefaultAction).To(Equal(ActionBlock))
147149
Expect(cfg.EntityActions).To(HaveKeyWithValue("PASSWORD", ActionBlock))
148150
Expect(cfg.EntityActions).To(HaveKeyWithValue("EMAIL", ActionMask))
149151
Expect(cfg.EntityActions).NotTo(HaveKey("BOGUS"))
150152
})
153+
154+
It("prefixes pattern-detector hits with the pattern source", func() {
155+
cfg := NERConfigFromRaw(det, 0, "mask", nil, SourcePattern)
156+
Expect(cfg.Source).To(Equal(SourcePattern))
157+
Expect(cfg.patternID("ANTHROPIC_KEY")).To(Equal("pattern:ANTHROPIC_KEY"))
158+
})
151159
})
152160

153161
var _ = Describe("NERConfig.ResolveAction", func() {

core/services/routing/pii/redactor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ func collectNERHits(ctx context.Context, text string, cfg NERConfig) ([]rawHit,
111111
"group", e.Group, "score", e.Score, "action", action,
112112
"start", e.Start, "end", e.End, "text", e.Text)
113113
hits = append(hits, rawHit{
114-
patternID: nerPatternID(e.Group),
114+
patternID: cfg.patternID(e.Group),
115115
action: action,
116116
start: e.Start,
117117
end: e.End,

core/services/routing/pii/redactor_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,20 @@ var _ = Describe("RedactNER emission", func() {
3030
Expect(res.Spans[0].HashPrefix).NotTo(BeEmpty(), "hash prefix must be set so audits can dedupe leaks")
3131
})
3232

33+
It("labels pattern-detector hits with the pattern source, not ner", func() {
34+
cfgs := []NERConfig{{
35+
Detector: &stubNERDetector{entities: []NEREntity{{Group: "ANTHROPIC_KEY", Start: 4, End: 24, Score: 1}}},
36+
EntityActions: map[string]Action{"ANTHROPIC_KEY": ActionMask},
37+
Source: SourcePattern,
38+
}}
39+
res, err := RedactNER(ctx, "use sk-ant-aaaaaaaaaaaaaaaa now", cfgs)
40+
Expect(err).NotTo(HaveOccurred())
41+
Expect(res.Redacted).To(ContainSubstring("[REDACTED:pattern:ANTHROPIC_KEY]"))
42+
Expect(res.Redacted).NotTo(ContainSubstring("[REDACTED:ner:"))
43+
Expect(res.Spans).To(HaveLen(1))
44+
Expect(res.Spans[0].Pattern).To(Equal("pattern:ANTHROPIC_KEY"))
45+
})
46+
3347
It("block leaves the matched span intact and sets Blocked", func() {
3448
res, err := RedactNER(ctx, "token sk-abcdef here", oneShot("PASSWORD", ActionBlock, 6, 15))
3549
Expect(err).NotTo(HaveOccurred())

core/services/routing/pii/types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ const (
6464
type Span struct {
6565
Start int
6666
End int
67-
Pattern string // synthetic detector id, "ner:<GROUP>"
67+
Pattern string // synthetic detector id, "<source>:<GROUP>" (e.g. "ner:EMAIL", "pattern:ANTHROPIC_KEY")
6868
HashPrefix string // first 8 chars of sha256(matched value); audit-safe
6969
Action Action // the action that fired for this span (after merge)
7070
Score float32 // detector confidence for the (winning) hit, 0..1

0 commit comments

Comments
 (0)