Skip to content

Commit 603da53

Browse files
Siddhant-K-codeOnaona-agent
authored
feat: v0.14.0 — live PII and sensitive data masking in proxy layer (#30)
* feat: live PII and sensitive data masking in proxy layer Adds masking.py with MaskingConfig and mask_event_data() that applies PII detection (emails, phones, credit cards, SSNs, AWS ARNs, custom patterns) on top of the existing secret redaction. Both the stdio proxy and HTTP proxy now accept a masking_config parameter and run masking before storing events. Closes #20 Co-authored-by: Ona <no-reply@ona.com> * fix: reduce phone regex false positives and fix ARN account ID masking - Phone regex now requires a separator (dash, dot, or parens) around the area code, preventing false positives on IP addresses and plain numeric sequences like '192.168.1.1' or version strings - AWS ARN masking now uses str.replace(..., 1) to avoid replacing the account ID string if it appears elsewhere in the ARN path Co-authored-by: Ona <no-reply@ona.com> --------- Co-authored-by: Ona <ona@gitpod.io> Co-authored-by: Ona <no-reply@ona.com>
1 parent 6fb0e15 commit 603da53

4 files changed

Lines changed: 332 additions & 6 deletions

File tree

src/agent_trace/http_proxy.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
from .models import EventType, SessionMeta, TraceEvent
3030
from .proxy import _classify_message
31-
from .redact import redact_data
31+
from .masking import MaskingConfig, mask_event_data
3232
from .store import TraceStore
3333

3434

@@ -41,13 +41,18 @@ class _ProxyHandler(BaseHTTPRequestHandler):
4141
meta: SessionMeta | None = None
4242
on_event: Callable[[TraceEvent], None] | None = None
4343
redact: bool = False
44+
masking_config: MaskingConfig | None = None
4445
pending_calls: dict[Any, TraceEvent] = {}
4546

4647
def _emit(self, event: TraceEvent) -> None:
4748
if self.meta:
4849
event.session_id = self.meta.session_id
49-
if self.redact:
50-
event.data = redact_data(event.data)
50+
if self.redact or self.masking_config:
51+
event.data = mask_event_data(
52+
event.data,
53+
config=self.masking_config,
54+
redact_secrets=self.redact,
55+
)
5156
if self.store and self.meta:
5257
self.store.append_event(self.meta.session_id, event)
5358

src/agent_trace/masking.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
"""Live PII and sensitive data masking for the proxy layer.
2+
3+
Extends redact.py with PII detection (emails, phone numbers, credit cards,
4+
SSNs, AWS account IDs in ARNs, optionally public IPv4 addresses, and custom regex patterns) applied in real-time as events
5+
flow through the proxy. Operates on raw JSON-RPC message bodies before they
6+
are stored as trace events.
7+
8+
Usage in proxy:
9+
from .masking import mask_event_data
10+
event.data = mask_event_data(event.data, config)
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import re
16+
from dataclasses import dataclass, field
17+
from typing import Any
18+
19+
from .redact import redact_data, REDACTED
20+
21+
# ---------------------------------------------------------------------------
22+
# PII patterns
23+
# ---------------------------------------------------------------------------
24+
25+
# Email addresses
26+
_EMAIL_RE = re.compile(
27+
r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"
28+
)
29+
30+
# Phone numbers (US/NANP formats, with an optional +1 country-code prefix)
31+
# Require a separator after the area code (dash or dot, or a parenthesised area code followed by space, dash, or dot) to reduce false positives
32+
# on plain numeric sequences like IP addresses or version numbers.
33+
_PHONE_RE = re.compile(
34+
r"(?<!\d)(?:\+?1[\s\-.]?)?"
35+
r"(?:\(\d{3}\)[\s\-.]|\d{3}[\-.])" # area code must have separator
36+
r"\d{3}[\s\-.]?\d{4}"
37+
r"(?!\d)"
38+
)
39+
40+
# Credit card numbers (Visa, MC, Amex, Discover as 13-16 contiguous digits, or any 16 digits in space/dash-separated groups of four)
41+
_CC_RE = re.compile(
42+
r"\b(?:4[0-9]{12}(?:[0-9]{3})?" # Visa
43+
r"|5[1-5][0-9]{14}" # Mastercard
44+
r"|3[47][0-9]{13}" # Amex
45+
r"|6(?:011|5[0-9]{2})[0-9]{12}" # Discover
46+
r"|(?:\d{4}[\s\-]){3}\d{4})\b" # Generic 16-digit with separators
47+
)
48+
49+
# US Social Security Numbers
50+
_SSN_RE = re.compile(
51+
r"\b(?!000|666|9\d{2})\d{3}[\s\-]"
52+
r"(?!00)\d{2}[\s\-]"
53+
r"(?!0000)\d{4}\b"
54+
)
55+
56+
# IPv4 addresses (private ranges are not masked; public IPs are)
57+
_IPV4_RE = re.compile(
58+
r"\b(?!(?:10|127|172\.(?:1[6-9]|2\d|3[01])|192\.168)\.\d+\.\d+)"
59+
r"(?:\d{1,3}\.){3}\d{1,3}\b"
60+
)
61+
62+
# AWS account IDs (12-digit numbers in ARNs)
63+
_AWS_ARN_RE = re.compile(r"arn:aws:[a-z0-9\-]+:[a-z0-9\-]*:(\d{12}):")
64+
65+
# Generic UUIDs that look like user/account IDs (not session IDs)
66+
# Intended for values of keys containing "user", "account", "customer"; NOTE: no masking function in this module references _UUID_RE yet.
67+
_UUID_RE = re.compile(
68+
r"\b[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\b",
69+
re.IGNORECASE,
70+
)
71+
72+
_USER_KEYS = {
73+
"user_id", "userid", "user-id", "account_id", "accountid",
74+
"customer_id", "customerid", "person_id", "personid",
75+
"email", "phone", "mobile", "ssn", "dob", "date_of_birth",
76+
"credit_card", "card_number", "cvv", "billing_address",
77+
}
78+
79+
# ---------------------------------------------------------------------------
80+
# Masking config
81+
# ---------------------------------------------------------------------------
82+
83+
@dataclass
84+
class MaskingConfig:
85+
mask_emails: bool = True
86+
mask_phones: bool = True
87+
mask_credit_cards: bool = True
88+
mask_ssn: bool = True
89+
mask_public_ips: bool = False # off by default — too noisy
90+
mask_aws_arns: bool = True
91+
mask_user_id_keys: bool = True
92+
# Custom regex patterns provided by the user
93+
custom_patterns: list[str] = field(default_factory=list)
94+
# Replacement token (default: [MASKED])
95+
replacement: str = "[MASKED]"
96+
97+
@classmethod
98+
def from_dict(cls, d: dict) -> "MaskingConfig":
99+
return cls(
100+
mask_emails=d.get("emails", True),
101+
mask_phones=d.get("phones", True),
102+
mask_credit_cards=d.get("credit_cards", True),
103+
mask_ssn=d.get("ssn", True),
104+
mask_public_ips=d.get("public_ips", False),
105+
mask_aws_arns=d.get("aws_arns", True),
106+
mask_user_id_keys=d.get("user_id_keys", True),
107+
custom_patterns=d.get("custom_patterns", []),
108+
replacement=d.get("replacement", "[MASKED]"),
109+
)
110+
111+
@classmethod
112+
def default(cls) -> "MaskingConfig":
113+
return cls()
114+
115+
116+
# ---------------------------------------------------------------------------
117+
# String-level masking
118+
# ---------------------------------------------------------------------------
119+
120+
def _mask_string(value: str, config: "MaskingConfig") -> str:
    """Apply all enabled PII patterns to a string value.

    Patterns run in a fixed order: emails, phones, credit cards, SSNs,
    public IPv4 addresses, AWS ARN account IDs, then the user-supplied
    custom patterns. Each pass operates on the output of the previous one.

    Args:
        value: The text to scan.
        config: Masking options. ``config.replacement`` is inserted verbatim
            for every match; it is never interpreted as a regex template.

    Returns:
        ``value`` with every match of an enabled pattern replaced.
    """
    import warnings  # local import: only needed for the invalid-pattern path

    token = config.replacement

    def _literal(_match: re.Match) -> str:
        # A callable replacement bypasses template expansion, so a token
        # containing "\1" or "\g<0>" cannot re-insert the matched PII and a
        # stray backslash cannot raise re.error.
        return token

    def _mask_arn(m: re.Match) -> str:
        # Splice by the captured group's span so only the account-ID field is
        # replaced, even if the same digits occur earlier in the ARN.
        arn = m.group(0)
        base = m.start(0)
        return arn[: m.start(1) - base] + token + arn[m.end(1) - base:]

    result = value

    if config.mask_emails:
        result = _EMAIL_RE.sub(_literal, result)

    if config.mask_phones:
        result = _PHONE_RE.sub(_literal, result)

    if config.mask_credit_cards:
        result = _CC_RE.sub(_literal, result)

    if config.mask_ssn:
        result = _SSN_RE.sub(_literal, result)

    if config.mask_public_ips:
        result = _IPV4_RE.sub(_literal, result)

    if config.mask_aws_arns:
        result = _AWS_ARN_RE.sub(_mask_arn, result)

    for pat_str in config.custom_patterns:
        try:
            compiled = re.compile(pat_str)
        except re.error as exc:
            # Best effort: an invalid user pattern must not break tracing,
            # but it should not go unnoticed either.
            warnings.warn(
                f"Ignoring invalid custom masking pattern {pat_str!r}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        result = compiled.sub(_literal, result)

    return result
153+
154+
155+
# ---------------------------------------------------------------------------
156+
# Recursive data masking
157+
# ---------------------------------------------------------------------------
158+
159+
def mask_data(data: Any, config: "MaskingConfig", parent_key: str = "") -> Any:
    """Recursively apply PII masking to event data.

    Runs *after* redact_data (which handles secrets/tokens). This layer
    handles PII: emails, phones, credit cards, SSNs, etc.

    Dicts and lists are rebuilt rather than mutated. When
    ``config.mask_user_id_keys`` is set, the value stored under a key listed
    in ``_USER_KEYS`` is masked wholesale, including values nested in lists
    or dicts and numeric values. Every other string is passed through
    ``_mask_string``. Non-string dict keys are never treated as sensitive.

    Args:
        data: Arbitrary JSON-like data.
        config: Masking options.
        parent_key: Key under which ``data`` was found; accepted for
            interface compatibility and not consulted here.

    Returns:
        A masked copy of ``data``.
    """
    if isinstance(data, dict):
        masked = {}
        for key, value in data.items():
            sensitive = (
                config.mask_user_id_keys
                and isinstance(key, str)
                and key.lower().strip() in _USER_KEYS
            )
            if sensitive:
                masked[key] = _mask_sensitive_value(value, config.replacement)
            else:
                masked[key] = mask_data(value, config, parent_key=key)
        return masked

    if isinstance(data, list):
        return [mask_data(item, config, parent_key=parent_key) for item in data]

    if isinstance(data, str):
        return _mask_string(data, config)

    return data


def _mask_sensitive_value(value: Any, replacement: str) -> Any:
    """Replace every string or numeric leaf of ``value`` with ``replacement``.

    ``None`` and booleans are preserved since they carry no identifying
    content; dicts and lists are walked so nested values cannot slip through.
    """
    if isinstance(value, dict):
        return {k: _mask_sensitive_value(v, replacement) for k, v in value.items()}
    if isinstance(value, list):
        return [_mask_sensitive_value(item, replacement) for item in value]
    if value is None or isinstance(value, bool):
        return value
    if isinstance(value, (str, int, float)):
        return replacement
    return value
182+
183+
184+
# ---------------------------------------------------------------------------
185+
# Combined masking entry point (secrets + PII)
186+
# ---------------------------------------------------------------------------
187+
188+
def mask_event_data(
    data: Any,
    config: MaskingConfig | None = None,
    redact_secrets: bool = True,
) -> Any:
    """Run secret redaction (optional) followed by PII masking on event data.

    Single entry point for the proxy layer. When ``config`` is ``None`` the
    default ``MaskingConfig`` is used. ``redact_data`` is applied first only
    if ``redact_secrets`` is true; ``mask_data`` is always applied afterwards.
    """
    effective_config = MaskingConfig.default() if config is None else config
    scrubbed = redact_data(data) if redact_secrets else data
    return mask_data(scrubbed, effective_config)

src/agent_trace/proxy.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from typing import IO, Any, Callable
2929

3030
from .models import EventType, SessionMeta, TraceEvent
31-
from .redact import redact_data
31+
from .masking import MaskingConfig, mask_event_data
3232
from .store import TraceStore
3333

3434

@@ -202,18 +202,24 @@ def __init__(
202202
session_meta: SessionMeta,
203203
on_event: Callable[[TraceEvent], None] | None = None,
204204
redact: bool = False,
205+
masking_config: MaskingConfig | None = None,
205206
):
206207
self.server_command = server_command
207208
self.store = store
208209
self.meta = session_meta
209210
self.on_event = on_event
210211
self.redact = redact
212+
self.masking_config = masking_config
211213
self._pending_calls: dict[Any, TraceEvent] = {}
212214

213215
def _emit(self, event: TraceEvent) -> None:
214216
event.session_id = self.meta.session_id
215-
if self.redact:
216-
event.data = redact_data(event.data)
217+
if self.redact or self.masking_config:
218+
event.data = mask_event_data(
219+
event.data,
220+
config=self.masking_config,
221+
redact_secrets=self.redact,
222+
)
217223
self.store.append_event(self.meta.session_id, event)
218224

219225
# update counters

tests/test_masking.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""Tests for live PII masking (issue #20)."""
2+
3+
import os
4+
import sys
5+
import unittest
6+
7+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
8+
9+
from agent_trace.masking import MaskingConfig, mask_data, mask_event_data, _mask_string
10+
11+
12+
class TestMaskString(unittest.TestCase):
    """String-level checks for ``_mask_string``."""

    def setUp(self):
        # Most tests share a config built with default settings.
        self.cfg = MaskingConfig.default()

    def test_masks_email(self):
        masked = _mask_string("Contact alice@example.com for help", self.cfg)
        self.assertNotIn("alice@example.com", masked)
        self.assertIn("[MASKED]", masked)

    def test_masks_phone_us(self):
        masked = _mask_string("Call 555-867-5309 now", self.cfg)
        self.assertNotIn("555-867-5309", masked)

    def test_masks_credit_card(self):
        masked = _mask_string("Card: 4111 1111 1111 1111", self.cfg)
        self.assertNotIn("4111 1111 1111 1111", masked)

    def test_masks_ssn(self):
        masked = _mask_string("SSN: 123-45-6789", self.cfg)
        self.assertNotIn("123-45-6789", masked)

    def test_safe_string_unchanged(self):
        # Text without PII must come back untouched.
        self.assertEqual(_mask_string("hello world", self.cfg), "hello world")

    def test_custom_pattern(self):
        custom_cfg = MaskingConfig(custom_patterns=[r"\bACCT-\d+\b"])
        masked = _mask_string("Account ACCT-12345 is active", custom_cfg)
        self.assertNotIn("ACCT-12345", masked)

    def test_public_ip_off_by_default(self):
        ip_off_cfg = MaskingConfig(mask_public_ips=False)
        masked = _mask_string("Server at 8.8.8.8", ip_off_cfg)
        self.assertIn("8.8.8.8", masked)

    def test_public_ip_masked_when_enabled(self):
        ip_on_cfg = MaskingConfig(mask_public_ips=True)
        masked = _mask_string("Server at 8.8.8.8", ip_on_cfg)
        self.assertNotIn("8.8.8.8", masked)
51+
52+
53+
class TestMaskData(unittest.TestCase):
    """Checks for recursive masking of dicts and lists via ``mask_data``."""

    def setUp(self):
        self.cfg = MaskingConfig.default()

    def test_masks_email_in_dict_value(self):
        payload = {"message": "Send to bob@test.org please"}
        masked = mask_data(payload, self.cfg)
        self.assertNotIn("bob@test.org", masked["message"])

    def test_masks_sensitive_key(self):
        # The "email" key is masked wholesale; "name" is left alone.
        payload = {"email": "alice@example.com", "name": "Alice"}
        masked = mask_data(payload, self.cfg)
        self.assertEqual(masked["email"], "[MASKED]")
        self.assertEqual(masked["name"], "Alice")

    def test_masks_nested(self):
        payload = {"user": {"contact": "call 555-123-4567"}}
        masked = mask_data(payload, self.cfg)
        self.assertNotIn("555-123-4567", masked["user"]["contact"])

    def test_masks_list_items(self):
        payload = {"notes": ["email: test@example.com", "safe note"]}
        notes = mask_data(payload, self.cfg)["notes"]
        self.assertNotIn("test@example.com", notes[0])
        self.assertEqual(notes[1], "safe note")

    def test_non_string_unchanged(self):
        payload = {"count": 42, "flag": True}
        masked = mask_data(payload, self.cfg)
        self.assertEqual(masked["count"], 42)
        self.assertEqual(masked["flag"], True)
84+
85+
86+
class TestMaskEventData(unittest.TestCase):
    """Checks for the combined entry point ``mask_event_data``."""

    def test_combined_secrets_and_pii(self):
        payload = {
            "api_key": "sk-abc123def456ghi789jkl012mno345pqr678",
            "message": "Contact alice@example.com",
        }
        masked = mask_event_data(payload)
        # The secret must no longer appear under its key.
        self.assertNotIn("sk-abc", str(masked.get("api_key", "")))
        # The email must no longer appear in free text.
        self.assertNotIn("alice@example.com", masked.get("message", ""))

    def test_no_redact_secrets_flag(self):
        payload = {"message": "email: test@example.com"}
        masked = mask_event_data(payload, redact_secrets=False)
        self.assertNotIn("test@example.com", masked["message"])

    def test_none_config_uses_defaults(self):
        payload = {"note": "phone: 555-123-4567"}
        masked = mask_event_data(payload, config=None)
        self.assertNotIn("555-123-4567", masked["note"])
102+
103+
def test_none_config_uses_defaults(self):
104+
data = {"note": "phone: 555-123-4567"}
105+
result = mask_event_data(data, config=None)
106+
self.assertNotIn("555-123-4567", result["note"])
107+
108+
109+
if __name__ == "__main__":
110+
unittest.main()

0 commit comments

Comments
 (0)