Skip to content

Commit 94c7452

Browse files
committed
Merge remote-tracking branch 'origin/master'
2 parents 178bf28 + 9469f42 commit 94c7452

14 files changed

Lines changed: 912 additions & 61 deletions

File tree

config/mboxer.example.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,13 +120,14 @@ security:
120120
default_export_profile: scrubbed
121121
scan_enabled: true
122122
scrub_enabled: true
123+
on_residual_findings: warn # allow | warn | block
123124
scan_attachments: true
124125
quarantine_unsafe_attachments: true
125-
redact_email_addresses: false
126+
redact_email_addresses: true
126127
redact_phone_numbers: true
127128
redact_ssn_like_numbers: true
128129
redact_credit_card_like_numbers: true
129-
redact_physical_addresses: false
130+
# reserved/planned detectors (not yet implemented): physical_addresses
130131

131132
exports:
132133
notebooklm:

docs/security-roadmap.md

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,19 @@ exclude
2121
Do not export.
2222
```
2323

24-
## Future scan checks
24+
## Current scan checks
2525

2626
Message checks:
2727

2828
- email addresses
2929
- phone numbers
3030
- SSN-like values
3131
- credit-card-like values
32+
33+
## Future scan checks
34+
35+
Reserved message detector names (planned; not yet implemented, so nothing below is detected or scrubbed today):
36+
3237
- postal addresses
3338
- medical terms
3439
- legal terms
@@ -63,3 +68,25 @@ The `security_findings` table should record:
6368
- excerpt or metadata
6469
- review status
6570
- created timestamp
71+
72+
Implemented export support:
73+
74+
- exports can be flagged with residual finding counts by type
75+
- exports can warn or block when projected export text still contains detected-sensitive items
76+
- export manifests and run metadata record residual counts, policy, and detector descriptors
77+
78+
## Residual export gate
79+
80+
`on_residual_findings` controls what happens after a record is projected for export and the
81+
projected body text is scanned again:
82+
83+
- `allow`: write the export and record residual counts in manifest metadata
84+
- `warn`: write the export, record residual counts, and emit a counts-only warning
85+
- `block`: abort before export files or export rows are written when residual counts are non-empty
86+
87+
The default is `warn`.
88+
89+
The scanner runs through a deterministic in-process detector registry. The active registry currently
90+
contains regex detectors for email addresses, phone numbers, SSN-like values, and credit-card-like
91+
values. Physical-address, medical, legal, financial-account, and credential detectors are reserved
92+
future names; they are not yet implemented and perform no detection or scrubbing.

mboxer-current-config.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,14 @@ security:
102102
default_export_profile: scrubbed
103103
scan_enabled: true
104104
scrub_enabled: true
105+
on_residual_findings: warn # allow | warn | block
105106
scan_attachments: true
106107
quarantine_unsafe_attachments: true
107-
redact_email_addresses: false
108+
redact_email_addresses: true
108109
redact_phone_numbers: true
109110
redact_ssn_like_numbers: true
110111
redact_credit_card_like_numbers: true
111-
redact_physical_addresses: false
112+
# reserved/planned detectors (not yet implemented): physical_addresses
112113

113114
exports:
114115
notebooklm:

src/mboxer/cli.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import argparse
44
import sqlite3
5+
import sys
56
from pathlib import Path
67

78
from .accounts import AccountError
@@ -137,6 +138,7 @@ def build_parser() -> argparse.ArgumentParser:
137138
p_nlm.add_argument("--out", default=None)
138139
p_nlm.add_argument("--export-profile",
139140
choices=["raw", "reviewed", "scrubbed", "metadata-only"], default=None)
141+
p_nlm.add_argument("--findings-policy", choices=["allow", "warn", "block"], default=None)
140142
p_nlm.add_argument("--profile", default=None, help="NotebookLM limit profile")
141143
p_nlm.add_argument("--max-sources", type=int)
142144
p_nlm.add_argument("--reserved-sources", type=int)
@@ -156,6 +158,7 @@ def build_parser() -> argparse.ArgumentParser:
156158
p_jsonl.add_argument("--out", default=None)
157159
p_jsonl.add_argument("--export-profile",
158160
choices=["raw", "reviewed", "scrubbed", "metadata-only"], default=None)
161+
p_jsonl.add_argument("--findings-policy", choices=["allow", "warn", "block"], default=None)
159162
p_jsonl.set_defaults(func=cmd_export_jsonl)
160163

161164
return parser
@@ -369,6 +372,7 @@ def cmd_security_scan(args: argparse.Namespace) -> None:
369372
def cmd_export_notebooklm(args: argparse.Namespace) -> None:
370373
from .accounts import resolve_account
371374
from .exporters.notebooklm import export_notebooklm
375+
from .security.findings import ResidualFindingsBlocked
372376
config, db_path = load_runtime(args)
373377
limits = resolve_notebooklm_limits(
374378
config, args.profile,
@@ -430,10 +434,20 @@ def cmd_export_notebooklm(args: argparse.Namespace) -> None:
430434
db_path=str(db_path),
431435
config_path=args.config,
432436
warnings=warnings,
437+
findings_policy=args.findings_policy,
433438
)
439+
except ResidualFindingsBlocked as exc:
440+
print(
441+
f"BLOCKED: would export residual detected-sensitive items {exc.counts}; "
442+
"no files written.",
443+
file=sys.stderr,
444+
)
445+
raise SystemExit(2) from exc
434446
finally:
435447
conn.close()
436448

449+
for warning in stats.get("warnings", [])[len(warnings):]:
450+
print(f"WARNING: {warning}")
437451
if args.dry_run:
438452
print(f" [{account['account_key']}] Dry run: {stats.get('groups', 0)} "
439453
f"category/band groups would become source files.")
@@ -445,6 +459,7 @@ def cmd_export_notebooklm(args: argparse.Namespace) -> None:
445459
def cmd_export_jsonl(args: argparse.Namespace) -> None:
446460
from .accounts import resolve_account
447461
from .exporters.jsonl import export_jsonl
462+
from .security.findings import ResidualFindingsBlocked
448463
config, db_path = load_runtime(args)
449464
conn = open_db(db_path)
450465
try:
@@ -472,9 +487,19 @@ def cmd_export_jsonl(args: argparse.Namespace) -> None:
472487
export_profile=args.export_profile,
473488
db_path=str(db_path),
474489
config_path=args.config,
490+
findings_policy=args.findings_policy,
491+
)
492+
except ResidualFindingsBlocked as exc:
493+
print(
494+
f"BLOCKED: would export residual detected-sensitive items {exc.counts}; "
495+
"no files written.",
496+
file=sys.stderr,
475497
)
498+
raise SystemExit(2) from exc
476499
finally:
477500
conn.close()
501+
if result.get("residual_findings") and result.get("residual_findings_policy") == "warn":
502+
print(f"WARNING: residual detected-sensitive items in export: {result['residual_findings']}")
478503
print(f"[{account_key}] Wrote {result['messages_written']} messages to {out_path}")
479504

480505

src/mboxer/exporters/jsonl.py

Lines changed: 74 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
from pathlib import Path
77
from typing import Any
88

9-
from ..security.policy import default_export_profile, resolve_export_profile
9+
from ..security.findings import ResidualFindingsBlocked, merge_counts
10+
from ..security.policy import default_export_profile, resolve_export_profile, resolve_findings_policy
1011
from .projection import prepare_projection
1112

1213

@@ -22,13 +23,15 @@ def export_jsonl(
2223
export_profile: str | None = None,
2324
db_path: str | None = None,
2425
config_path: str | None = None,
26+
findings_policy: str | None = None,
2527
) -> dict[str, Any]:
2628
jsonl_config = (config.get("exports") or {}).get("jsonl") or {}
2729
include_classification = jsonl_config.get("include_classification", True)
2830
security = config.get("security") or {}
2931
config_default = default_export_profile(security.get("default_export_profile"))
3032
security_profile = config_default
3133
effective_profile = resolve_export_profile(export_profile, config_default)
34+
policy = resolve_findings_policy(security.get("on_residual_findings"), override=findings_policy)
3235

3336
if account_id is not None:
3437
rows = conn.execute(
@@ -88,48 +91,62 @@ def export_jsonl(
8891
"classifier_type": cr[5],
8992
}
9093

91-
out_path.parent.mkdir(parents=True, exist_ok=True)
92-
written = 0
9394
candidate_message_count = len(rows)
9495
excluded_message_count = 0
9596
any_scrubbed = False
97+
residual_total: dict[str, int] = {}
98+
projected_records: list[dict[str, Any]] = []
99+
100+
for row in rows:
101+
record = dict(zip(cols, row))
102+
103+
per_record_profile = (classifications.get(record["id"]) or {}).get("export_profile")
104+
projected = prepare_projection(
105+
record,
106+
config,
107+
override_profile=export_profile,
108+
record_profile=per_record_profile,
109+
clear_body_word_count_for_metadata_only=True,
110+
)
111+
if projected is None:
112+
excluded_message_count += 1
113+
continue
114+
115+
record = projected.record
116+
merge_counts(residual_total, projected.residual)
117+
if projected.was_scrubbed:
118+
any_scrubbed = True
119+
120+
record["account_key"] = account_key
121+
try:
122+
record["recipients"] = json.loads(record.pop("recipients_json") or "[]")
123+
record["cc"] = json.loads(record.pop("cc_json") or "[]")
124+
except Exception:
125+
record["recipients"] = []
126+
record["cc"] = []
127+
128+
if include_classification and record["id"] in classifications:
129+
record["classification"] = classifications[record["id"]]
130+
131+
projected_records.append(record)
132+
133+
if policy == "block" and residual_total:
134+
raise ResidualFindingsBlocked(residual_total)
135+
136+
warnings: list[str] = []
137+
if policy == "warn" and residual_total:
138+
warnings.append(f"residual detected-sensitive items in export: {residual_total}")
139+
140+
out_path.parent.mkdir(parents=True, exist_ok=True)
141+
written = 0
96142
thread_keys: set[str] = set()
97143
date_min: str | None = None
98144
date_max: str | None = None
99145
word_count = 0
100146
export_id = _start_export_run(conn, "jsonl", str(out_path), effective_profile, account_id)
101147

102148
with out_path.open("w", encoding="utf-8") as f:
103-
for row in rows:
104-
record = dict(zip(cols, row))
105-
106-
# Resolve export profile for this record
107-
per_record_profile = (classifications.get(record["id"]) or {}).get("export_profile")
108-
projected = prepare_projection(
109-
record,
110-
config,
111-
override_profile=export_profile,
112-
record_profile=per_record_profile,
113-
clear_body_word_count_for_metadata_only=True,
114-
)
115-
if projected is None:
116-
excluded_message_count += 1
117-
continue
118-
record = projected.record
119-
if projected.was_scrubbed:
120-
any_scrubbed = True
121-
122-
record["account_key"] = account_key
123-
try:
124-
record["recipients"] = json.loads(record.pop("recipients_json") or "[]")
125-
record["cc"] = json.loads(record.pop("cc_json") or "[]")
126-
except Exception:
127-
record["recipients"] = []
128-
record["cc"] = []
129-
130-
if include_classification and record["id"] in classifications:
131-
record["classification"] = classifications[record["id"]]
132-
149+
for record in projected_records:
133150
f.write(json.dumps(record, ensure_ascii=False) + "\n")
134151
written += 1
135152

@@ -171,6 +188,11 @@ def export_jsonl(
171188
export_format=jsonl_config,
172189
candidate_message_count=candidate_message_count,
173190
excluded_message_count=excluded_message_count,
191+
warnings=warnings,
192+
residual_scan_performed=True,
193+
residual_findings_total=sum(residual_total.values()),
194+
residual_findings_by_type=residual_total,
195+
residual_findings_policy=policy,
174196
)
175197
manifest_path = write_jsonl_manifest(out_path, manifest_rows)
176198
source_count = 1 if out_path.exists() else 0
@@ -211,6 +233,11 @@ def export_jsonl(
211233
message_count=written,
212234
contains_scrubbed_content=any_scrubbed,
213235
generated_sha256=manifest_rows[0]["generated_sha256"],
236+
warnings=warnings,
237+
residual_scan_performed=True,
238+
residual_findings_total=sum(residual_total.values()),
239+
residual_findings_by_type=residual_total,
240+
residual_findings_policy=policy,
214241
),
215242
export_id,
216243
),
@@ -224,6 +251,10 @@ def export_jsonl(
224251
"contains_scrubbed_content": any_scrubbed,
225252
"candidate_message_count": candidate_message_count,
226253
"excluded_message_count": excluded_message_count,
254+
"residual_findings": residual_total,
255+
"residual_findings_total": sum(residual_total.values()),
256+
"residual_findings_policy": policy,
257+
"warnings": warnings,
227258
}
228259

229260

@@ -262,6 +293,11 @@ def _jsonl_export_metadata_json(
262293
message_count: int,
263294
contains_scrubbed_content: bool,
264295
generated_sha256: str,
296+
warnings: list[str] | None,
297+
residual_scan_performed: bool,
298+
residual_findings_total: int,
299+
residual_findings_by_type: dict[str, int],
300+
residual_findings_policy: str,
265301
) -> str:
266302
from .manifest import build_safe_export_run_metadata, security_manifest_posture
267303

@@ -289,5 +325,10 @@ def _jsonl_export_metadata_json(
289325
message_count=message_count,
290326
contains_scrubbed_content=contains_scrubbed_content,
291327
generated_sha256=generated_sha256,
328+
warnings=warnings,
329+
residual_scan_performed=residual_scan_performed,
330+
residual_findings_total=residual_findings_total,
331+
residual_findings_by_type=residual_findings_by_type,
332+
residual_findings_policy=residual_findings_policy,
292333
)
293334
return json.dumps(metadata, ensure_ascii=False, sort_keys=True)

0 commit comments

Comments
 (0)