1717//! zones pass through unchanged. The Microsoft Spotlighting paper's
1818//! threat model requires the instruction surface (system prompt,
1919//! user's own question) to remain a normal natural-language signal.
20- //! * Whitespace is `char::is_whitespace()` (Unicode whitespace
21- //! property) — matches the paper's intent and covers ASCII space,
22- //! tab, newline, NBSP (`U+00A0`), ZWSP (`U+200B`), etc.
20+ //! * Whitespace is [`is_substitutable_whitespace`] — the Unicode
21+ //! `White_Space` property plus the zero-width / formatting codepoints
22+ //! used as invisible prompt-injection vectors (ZWSP, ZWNJ, ZWJ, WJ,
23+ //! BOM). See that function's docs for the rationale.
2324//! * The transform is idempotent: applying it twice to the same input
2425//! produces the same output. This is required because the proxy may
2526//! retry requests and because the marker is also a PUA codepoint
26- //! that `is_whitespace()` rejects.
27+ //! that the predicate rejects.
2728//! * Marker selection: try the configured default first. If it appears
2829//! inside the zone content (a vanishingly rare collision), resample
2930//! from `PUA_RANGE` until a non-colliding codepoint is found. The
@@ -193,8 +194,26 @@ impl DatamarkingTransform {
193194// Internal helpers
194195// ---------------------------------------------------------------------------
195196
196- /// Replace every Unicode whitespace codepoint in `content` with `marker`.
197- /// Returns `(substituted_string, byte_delta)`.
/// Returns `true` for every codepoint the datamarking transform must
/// replace with the marker.
///
/// The set is the Unicode `White_Space` property (as reported by
/// [`char::is_whitespace`]) plus five zero-width / formatting codepoints
/// that `White_Space` excludes:
///
/// | Codepoint | Name                              |
/// |-----------|-----------------------------------|
/// | `U+200B`  | ZERO WIDTH SPACE (ZWSP)           |
/// | `U+200C`  | ZERO WIDTH NON-JOINER (ZWNJ)      |
/// | `U+200D`  | ZERO WIDTH JOINER (ZWJ)           |
/// | `U+2060`  | WORD JOINER (WJ)                  |
/// | `U+FEFF`  | ZERO WIDTH NO-BREAK SPACE (BOM)   |
///
/// Those codepoints are documented prompt-injection vectors used to
/// smuggle invisible instructions inside otherwise-benign Data zones, so
/// the attack surface is wider than the Unicode whitespace property
/// alone. This predicate closes that gap (issue #215, follow-up to
/// PR #214).
///
/// Any other codepoint — including private-use codepoints such as
/// `U+E000` — yields `false`.
#[must_use]
pub fn is_substitutable_whitespace(c: char) -> bool {
    match c {
        // Invisible formatting codepoints that `char::is_whitespace`
        // does not report.
        '\u{200B}' // ZWSP
        | '\u{200C}' // ZWNJ
        | '\u{200D}' // ZWJ
        | '\u{2060}' // WJ
        | '\u{FEFF}' // BOM
        => true,
        // Everything else defers to the Unicode `White_Space` property.
        _ => c.is_whitespace(),
    }
}
214+
215+ /// Replace every substitutable whitespace codepoint in `content` with
216+ /// `marker`. Returns `(substituted_string, byte_delta)`.
198217///
199218/// Byte delta is `substituted.len() as i64 - content.len() as i64`. A
200219/// positive value means the marker's UTF-8 width exceeds the
@@ -204,7 +223,7 @@ fn substitute_whitespace(content: &str, marker: char) -> (String, i64) {
204223 let original_len = content. len ( ) as i64 ;
205224 let mut out = String :: with_capacity ( content. len ( ) ) ;
206225 for ch in content. chars ( ) {
207- if ch . is_whitespace ( ) {
226+ if is_substitutable_whitespace ( ch ) {
208227 out. push ( marker) ;
209228 } else {
210229 out. push ( ch) ;
@@ -341,9 +360,9 @@ mod tests {
341360 let t = fixed_marker_transform ( ) ;
342361 // ASCII space, tab, newline, NBSP, vertical tab, form feed —
343362 // all six are in the Unicode `White_Space` property used by
344- // `char::is_whitespace()`. ZWSP (`U+200B`) is intentionally
345- // NOT in that property (Unicode classifies it as format/Cf,
346- // not whitespace) and is tested separately below.
363+ // `char::is_whitespace()`. Zero-width formatting codepoints
364+ // (ZWSP/ZWNJ/ZWJ/WJ/BOM) are NOT in that property; they are
365+ // covered by the dedicated zero-width tests below.
347366 let zone = data_zone ( "a b\t c\n d\u{00A0} e\u{000B} f\u{000C} g" ) ;
348367 let out = t. apply ( & [ zone] ) ;
349368 let mz = & out[ 0 ] ;
@@ -355,19 +374,59 @@ mod tests {
355374 }
356375
#[test]
fn zwsp_is_substituted() {
    // Inverse of the original `zwsp_is_not_substituted_by_design`
    // (issue #215). ZWSP (`U+200B`) is a documented prompt-injection
    // vector — it MUST be replaced by the marker, not passed through.
    //
    // The input deliberately contains no ordinary whitespace: ZWSP is
    // the only substitutable codepoint, so the exact-content and
    // byte-delta assertions below isolate the zero-width branch.
    let t = fixed_marker_transform();
    let zone = data_zone("a\u{200B}b");
    let out = t.apply(&[zone]);
    let mz = &out[0];
    assert!(!mz.content.contains('\u{200B}'));
    assert!(mz.content.contains(DEFAULT_MARKER));
    // ZWSP is 3 bytes in UTF-8, same as U+E000 -> zero net delta.
    assert_eq!(mz.byte_delta, 0);
    assert_eq!(mz.content, format!("a{}b", DEFAULT_MARKER));
}
391+
#[test]
fn zwnj_is_substituted() {
    // ZWNJ (`U+200C`) must be replaced by the marker. The input holds
    // no ordinary whitespace, so the marker can only come from the
    // zero-width codepoint itself.
    let t = fixed_marker_transform();
    let zone = data_zone("a\u{200C}b");
    let out = t.apply(&[zone]);
    let mz = &out[0];
    assert!(!mz.content.contains('\u{200C}'));
    assert!(mz.content.contains(DEFAULT_MARKER));
    // ZWNJ is 3 bytes in UTF-8, same as U+E000 -> zero net delta.
    assert_eq!(mz.byte_delta, 0);
    assert_eq!(mz.content, format!("a{}b", DEFAULT_MARKER));
}
401+
#[test]
fn zwj_is_substituted() {
    // ZWJ (`U+200D`) must be replaced by the marker. The input holds
    // no ordinary whitespace, so the marker can only come from the
    // zero-width codepoint itself.
    let t = fixed_marker_transform();
    let zone = data_zone("a\u{200D}b");
    let out = t.apply(&[zone]);
    let mz = &out[0];
    assert!(!mz.content.contains('\u{200D}'));
    assert!(mz.content.contains(DEFAULT_MARKER));
    // ZWJ is 3 bytes in UTF-8, same as U+E000 -> zero net delta.
    assert_eq!(mz.byte_delta, 0);
    assert_eq!(mz.content, format!("a{}b", DEFAULT_MARKER));
}
411+
#[test]
fn word_joiner_is_substituted() {
    // WORD JOINER (`U+2060`) must be replaced by the marker. The input
    // holds no ordinary whitespace, so the marker can only come from
    // the zero-width codepoint itself.
    let t = fixed_marker_transform();
    let zone = data_zone("a\u{2060}b");
    let out = t.apply(&[zone]);
    let mz = &out[0];
    assert!(!mz.content.contains('\u{2060}'));
    assert!(mz.content.contains(DEFAULT_MARKER));
    // WJ is 3 bytes in UTF-8, same as U+E000 -> zero net delta.
    assert_eq!(mz.byte_delta, 0);
    assert_eq!(mz.content, format!("a{}b", DEFAULT_MARKER));
}
421+
#[test]
fn bom_is_substituted() {
    // BOM / ZERO WIDTH NO-BREAK SPACE (`U+FEFF`) must be replaced by
    // the marker. The input holds no ordinary whitespace, so the
    // marker can only come from the zero-width codepoint itself.
    let t = fixed_marker_transform();
    let zone = data_zone("a\u{FEFF}b");
    let out = t.apply(&[zone]);
    let mz = &out[0];
    assert!(!mz.content.contains('\u{FEFF}'));
    assert!(mz.content.contains(DEFAULT_MARKER));
    // U+FEFF is 3 bytes in UTF-8, same as U+E000 -> zero net delta.
    assert_eq!(mz.byte_delta, 0);
    assert_eq!(mz.content, format!("a{}b", DEFAULT_MARKER));
}
372431
373432 #[ test]
@@ -395,9 +454,11 @@ mod tests {
395454 #[ test]
396455 fn idempotence_apply_twice_equals_apply_once ( ) {
397456 // The marker is not whitespace, so a second pass MUST be a
398- // no-op (zero new replacements, zero new byte delta).
457+ // no-op (zero new replacements, zero new byte delta). Input
458+ // mixes ordinary whitespace and zero-width codepoints to
459+ // exercise both classifier branches.
399460 let t = fixed_marker_transform ( ) ;
400- let zone = data_zone ( "hello world\n foo bar" ) ;
461+ let zone = data_zone ( "hello world\n foo\u{200B} bar\u{FEFF} baz " ) ;
401462 let first = t. apply ( & [ zone] ) ;
402463 let once_content = first[ 0 ] . content . clone ( ) ;
403464 let once_byte_range = first[ 0 ] . byte_range . clone ( ) ;
0 commit comments