Skip to content

Commit 70c9ccb

Browse files
committed
fix(runtime): replace unpaired surrogates with U+FFFD in UTF-16 TextDecoder
The UTF-16 TextDecoder (both LE and BE) was passing raw code units directly to JsString, preserving unpaired surrogates instead of replacing them with U+FFFD as required by the WHATWG Encoding Standard. Route both decoders through a shared `decode_utf16_units` helper that uses `std::char::decode_utf16` with replacement, and simplify the UTF-16 BE decoder to borrow instead of taking ownership. Closes #4612
1 parent f5e88de commit 70c9ccb

File tree

3 files changed

+110
-42
lines changed

3 files changed

+110
-42
lines changed

core/runtime/src/text/encodings.rs

Lines changed: 43 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -21,61 +21,66 @@ pub(crate) mod utf8 {
2121
}
2222
}
2323

24+
/// Decodes an iterator of UTF-16 code units into a well-formed `JsString`,
25+
/// replacing any unpaired surrogates with U+FFFD.
26+
///
27+
/// If `dangling_byte` is true and the last decoded code unit is not a high
28+
/// surrogate (which would already have been replaced), an additional U+FFFD
29+
/// is appended for the truncated trailing byte.
30+
fn decode_utf16_units(
31+
code_units: impl IntoIterator<Item = u16>,
32+
dangling_byte: bool,
33+
) -> boa_engine::JsString {
34+
let mut string = String::new();
35+
let mut last_code_unit = None;
36+
string.extend(
37+
std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| {
38+
last_code_unit = Some(*code_unit);
39+
}))
40+
.map(|result| result.unwrap_or('\u{FFFD}')),
41+
);
42+
let trailing_high_surrogate =
43+
last_code_unit.is_some_and(|code_unit| (0xD800..=0xDBFF).contains(&code_unit));
44+
if dangling_byte && !trailing_high_surrogate {
45+
string.push('\u{FFFD}');
46+
}
47+
boa_engine::JsString::from(string)
48+
}
49+
2450
pub(crate) mod utf16le {
25-
use boa_engine::{JsString, js_string};
51+
use boa_engine::JsString;
2652

2753
pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
2854
if strip_bom {
2955
input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input);
3056
}
3157

32-
// After this point, input is of even length.
33-
let dangling = if input.len().is_multiple_of(2) {
34-
false
35-
} else {
58+
let dangling_byte = !input.len().is_multiple_of(2);
59+
if dangling_byte {
3660
input = &input[0..input.len() - 1];
37-
true
38-
};
39-
40-
let input: &[u16] = bytemuck::cast_slice(input);
41-
42-
if dangling {
43-
JsString::from(&[JsString::from(input), js_string!("\u{FFFD}")])
44-
} else {
45-
JsString::from(input)
4661
}
62+
63+
let code_units: &[u16] = bytemuck::cast_slice(input);
64+
super::decode_utf16_units(code_units.iter().copied(), dangling_byte)
4765
}
4866
}
4967

5068
pub(crate) mod utf16be {
51-
use boa_engine::{JsString, js_string};
69+
use boa_engine::JsString;
5270

53-
pub(crate) fn decode(mut input: Vec<u8>, strip_bom: bool) -> JsString {
54-
if strip_bom && input.starts_with(&[0xFE, 0xFF]) {
55-
input.drain(..2);
71+
pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
72+
if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) {
73+
input = rest;
5674
}
5775

58-
let mut input = input.as_mut_slice();
59-
// After this point, input is of even length.
60-
let dangling = if input.len().is_multiple_of(2) {
61-
false
62-
} else {
63-
let new_len = input.len() - 1;
64-
input = &mut input[0..new_len];
65-
true
66-
};
67-
68-
let input: &mut [u16] = bytemuck::cast_slice_mut(input);
69-
70-
// Swap the bytes.
71-
for b in &mut *input {
72-
*b = b.swap_bytes();
76+
let dangling_byte = !input.len().is_multiple_of(2);
77+
if dangling_byte {
78+
input = &input[0..input.len() - 1];
7379
}
7480

75-
if dangling {
76-
JsString::from(&[JsString::from(&*input), js_string!("\u{FFFD}")])
77-
} else {
78-
JsString::from(&*input)
79-
}
81+
let code_units = input
82+
.chunks_exact(2)
83+
.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
84+
super::decode_utf16_units(code_units, dangling_byte)
8085
}
8186
}

core/runtime/src/text/mod.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,10 +200,7 @@ impl TextDecoder {
200200
Ok(match self.encoding {
201201
Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),
202202
Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),
203-
Encoding::Utf16Be => {
204-
let owned = data.to_vec();
205-
encodings::utf16be::decode(owned, strip_bom)
206-
}
203+
Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom),
207204
})
208205
}
209206
}

core/runtime/src/text/tests.rs

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use super::encodings;
12
use crate::test::{TestAction, run_test_actions_with};
23
use crate::text;
34
use boa_engine::object::builtins::JsUint8Array;
@@ -476,3 +477,68 @@ fn decoder_handle_data_view_offset_and_length() {
476477
context,
477478
);
478479
}
480+
481+
// Test cases from issue #4612: unpaired surrogates must be replaced with U+FFFD.
482+
const INVALID_UTF16_CASES: &[(&[u16], &[u16])] = &[
483+
// Lone high surrogate in the middle
484+
(
485+
&[0x0061, 0x0062, 0xD800, 0x0077, 0x0078],
486+
&[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
487+
),
488+
// Lone high surrogate only
489+
(&[0xD800], &[0xFFFD]),
490+
// Two consecutive high surrogates
491+
(&[0xD800, 0xD800], &[0xFFFD, 0xFFFD]),
492+
// Lone low surrogate in the middle
493+
(
494+
&[0x0061, 0x0062, 0xDFFF, 0x0077, 0x0078],
495+
&[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
496+
),
497+
// Low surrogate followed by high surrogate (wrong order)
498+
(&[0xDFFF, 0xD800], &[0xFFFD, 0xFFFD]),
499+
];
500+
501+
#[test]
502+
fn decoder_utf16le_replaces_unpaired_surrogates() {
503+
for (invalid, replaced) in INVALID_UTF16_CASES {
504+
let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
505+
for &code_unit in *invalid {
506+
input_bytes.extend_from_slice(&code_unit.to_le_bytes());
507+
}
508+
509+
let result = encodings::utf16le::decode(&input_bytes, false);
510+
let expected = JsString::from(*replaced);
511+
assert_eq!(result, expected, "utf16le failed for input {invalid:?}");
512+
}
513+
}
514+
515+
#[test]
516+
fn decoder_utf16be_replaces_unpaired_surrogates() {
517+
for (invalid, replaced) in INVALID_UTF16_CASES {
518+
let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
519+
for &code_unit in *invalid {
520+
input_bytes.extend_from_slice(&code_unit.to_be_bytes());
521+
}
522+
523+
let result = encodings::utf16be::decode(&input_bytes, false);
524+
let expected = JsString::from(*replaced);
525+
assert_eq!(result, expected, "utf16be failed for input {invalid:?}");
526+
}
527+
}
528+
529+
#[test]
530+
fn decoder_utf16le_dangling_byte_produces_replacement() {
531+
// Odd-length input: the last byte is truncated and replaced with U+FFFD
532+
let input: &[u8] = &[0x41, 0x00, 0x42]; // 'A' (LE) + dangling byte
533+
let result = encodings::utf16le::decode(input, false);
534+
let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
535+
assert_eq!(result, expected);
536+
}
537+
538+
#[test]
539+
fn decoder_utf16be_dangling_byte_produces_replacement() {
540+
let input: &[u8] = &[0x00, 0x41, 0x42]; // 'A' (BE) + dangling byte
541+
let result = encodings::utf16be::decode(input, false);
542+
let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
543+
assert_eq!(result, expected);
544+
}

0 commit comments

Comments
 (0)