diff --git a/crates/llmtrace-security/src/datamarking.rs b/crates/llmtrace-security/src/datamarking.rs index 4a03e00f..daf73b4e 100644 --- a/crates/llmtrace-security/src/datamarking.rs +++ b/crates/llmtrace-security/src/datamarking.rs @@ -17,13 +17,14 @@ //! zones pass through unchanged. The Microsoft Spotlighting paper's //! threat model requires the instruction surface (system prompt, //! user's own question) to remain a normal natural-language signal. -//! * Whitespace is `char::is_whitespace()` (Unicode whitespace -//! property) — matches the paper's intent and covers ASCII space, -//! tab, newline, NBSP (`U+00A0`), ZWSP (`U+200B`), etc. +//! * Whitespace is [`is_substitutable_whitespace`] — the Unicode +//! `White_Space` property plus the zero-width / formatting codepoints +//! used as invisible prompt-injection vectors (ZWSP, ZWNJ, ZWJ, WJ, +//! BOM). See that function's docs for the rationale. //! * The transform is idempotent: applying it twice to the same input //! produces the same output. This is required because the proxy may //! retry requests and because the marker is also a PUA codepoint -//! that `is_whitespace()` rejects. +//! that the predicate rejects. //! * Marker selection: try the configured default first. If it appears //! inside the zone content (a vanishingly rare collision), resample //! from `PUA_RANGE` until a non-colliding codepoint is found. The @@ -193,8 +194,26 @@ impl DatamarkingTransform { // Internal helpers // --------------------------------------------------------------------------- -/// Replace every Unicode whitespace codepoint in `content` with `marker`. -/// Returns `(substituted_string, byte_delta)`. +/// Predicate for codepoints the datamarking transform must replace +/// with the marker. +/// +/// `char::is_whitespace` follows the Unicode `White_Space` property +/// which excludes zero-width / formatting codepoints (ZWSP `U+200B`, +/// ZWNJ `U+200C`, ZWJ `U+200D`, WJ `U+2060`, BOM `U+FEFF`). Those +/// codepoints are documented prompt-injection vectors used to smuggle +/// invisible instructions inside otherwise-benign Data zones, so the +/// attack surface is wider than the Unicode whitespace property. This +/// predicate closes that gap (issue #215, follow-up to PR #214). +pub fn is_substitutable_whitespace(c: char) -> bool { + c.is_whitespace() + || matches!( + c, + '\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{2060}' | '\u{FEFF}' + ) +} + +/// Replace every substitutable whitespace codepoint in `content` with +/// `marker`. Returns `(substituted_string, byte_delta)`. /// /// Byte delta is `substituted.len() as i64 - content.len() as i64`. A /// positive value means the marker's UTF-8 width exceeds the @@ -204,7 +223,7 @@ fn substitute_whitespace(content: &str, marker: char) -> (String, i64) { let original_len = content.len() as i64; let mut out = String::with_capacity(content.len()); for ch in content.chars() { - if ch.is_whitespace() { + if is_substitutable_whitespace(ch) { out.push(marker); } else { out.push(ch); @@ -341,9 +360,9 @@ mod tests { let t = fixed_marker_transform(); // ASCII space, tab, newline, NBSP, vertical tab, form feed — // all six are in the Unicode `White_Space` property used by - // `char::is_whitespace()`. ZWSP (`U+200B`) is intentionally - // NOT in that property (Unicode classifies it as format/Cf, - // not whitespace) and is tested separately below. + // `char::is_whitespace()`. Zero-width formatting codepoints + // (ZWSP/ZWNJ/ZWJ/WJ/BOM) are NOT in that property; they are + // covered by the dedicated zero-width tests below. let zone = data_zone("a b\tc\nd\u{00A0}e\u{000B}f\u{000C}g"); let out = t.apply(&[zone]); let mz = &out[0]; @@ -355,19 +374,59 @@ mod tests { } #[test] - fn zwsp_is_not_substituted_by_design() { - // The brief pinned `char::is_whitespace()` as the classifier. - // ZWSP (`U+200B`) is not in the Unicode `White_Space` property, - // so it MUST pass through unchanged. Documenting this so a - // future "fix" to also substitute ZWSP comes with an - // explicit decision to widen the contract. + fn zwsp_is_substituted() { + // Inverse of the original `zwsp_is_not_substituted_by_design` + // (issue #215). ZWSP (`U+200B`) is a documented prompt-injection + // vector — it MUST be replaced by the marker, not passed through. let t = fixed_marker_transform(); let zone = data_zone("a\u{200B}b"); let out = t.apply(&[zone]); let mz = &out[0]; - assert!(mz.content.contains('\u{200B}')); - assert!(!mz.content.contains(DEFAULT_MARKER)); + assert!(!mz.content.contains('\u{200B}')); + assert!(mz.content.contains(DEFAULT_MARKER)); + // ZWSP is 3 bytes in UTF-8, same as U+E000 -> zero net delta. assert_eq!(mz.byte_delta, 0); + assert_eq!(mz.content, format!("a{}b", DEFAULT_MARKER)); + } + + #[test] + fn zwnj_is_substituted() { + let t = fixed_marker_transform(); + let zone = data_zone("a\u{200C}b"); + let out = t.apply(&[zone]); + let mz = &out[0]; + assert!(!mz.content.contains('\u{200C}')); + assert!(mz.content.contains(DEFAULT_MARKER)); + } + + #[test] + fn zwj_is_substituted() { + let t = fixed_marker_transform(); + let zone = data_zone("a\u{200D}b"); + let out = t.apply(&[zone]); + let mz = &out[0]; + assert!(!mz.content.contains('\u{200D}')); + assert!(mz.content.contains(DEFAULT_MARKER)); + } + + #[test] + fn word_joiner_is_substituted() { + let t = fixed_marker_transform(); + let zone = data_zone("a\u{2060}b"); + let out = t.apply(&[zone]); + let mz = &out[0]; + assert!(!mz.content.contains('\u{2060}')); + assert!(mz.content.contains(DEFAULT_MARKER)); + } + + #[test] + fn bom_is_substituted() { + let t = fixed_marker_transform(); + let zone = data_zone("a\u{FEFF}b"); + let out = t.apply(&[zone]); + let mz = &out[0]; + assert!(!mz.content.contains('\u{FEFF}')); + assert!(mz.content.contains(DEFAULT_MARKER)); } #[test] @@ -395,9 +454,11 @@ mod tests { #[test] fn idempotence_apply_twice_equals_apply_once() { // The marker is not whitespace, so a second pass MUST be a - // no-op (zero new replacements, zero new byte delta). + // no-op (zero new replacements, zero new byte delta). Input + // mixes ordinary whitespace and zero-width codepoints to + // exercise both classifier branches. let t = fixed_marker_transform(); - let zone = data_zone("hello world\nfoo bar"); + let zone = data_zone("hello world\nfoo\u{200B}bar\u{FEFF}baz"); let first = t.apply(&[zone]); let once_content = first[0].content.clone(); let once_byte_range = first[0].byte_range.clone();