fix(runtime): replace unpaired surrogates with U+FFFD in UTF-16 TextDecoder

tkshsbcue · tkshsbcue · commit 70c9ccba671f · 2026-04-07T20:01:36.000+05:30
The UTF-16 TextDecoder (both LE and BE) was passing raw code units directly to JsString, preserving unpaired surrogates instead of replacing them with U+FFFD as required by the WHATWG Encoding Standard. Route both decoders through a shared `decode_utf16_units` helper that uses `std::char::decode_utf16` with replacement, and simplify the UTF-16 BE decoder to borrow instead of taking ownership. Closes #4612
diff --git a/core/runtime/src/text/encodings.rs b/core/runtime/src/text/encodings.rs
@@ -21,61 +21,66 @@ pub(crate) mod utf8 {
     }
 }
 
+/// Builds a well-formed `JsString` from a sequence of UTF-16 code units,
+/// substituting U+FFFD for every surrogate that is not part of a valid pair.
+///
+/// When `dangling_byte` is set, the caller dropped an odd trailing byte; one
+/// more U+FFFD is appended for it unless the final code unit was a high
+/// surrogate, whose replacement already stands in for the truncated tail.
+fn decode_utf16_units(
+    code_units: impl IntoIterator<Item = u16>,
+    dangling_byte: bool,
+) -> boa_engine::JsString {
+    let mut decoded = String::new();
+    let mut ends_with_high_surrogate = false;
+    for unit in std::char::decode_utf16(code_units) {
+        // The final iteration leaves the flag describing the last code unit.
+        ends_with_high_surrogate = unit
+            .as_ref()
+            .is_err_and(|error| (0xD800..=0xDBFF).contains(&error.unpaired_surrogate()));
+        decoded.push(unit.unwrap_or('\u{FFFD}'));
+    }
+    // A trailing high surrogate has already produced the U+FFFD for the tail.
+    if dangling_byte && !ends_with_high_surrogate {
+        decoded.push('\u{FFFD}');
+    }
+    boa_engine::JsString::from(decoded)
+}
+
 pub(crate) mod utf16le {
-    use boa_engine::{JsString, js_string};
+    use boa_engine::JsString;
 
+    /// Decodes little-endian UTF-16 bytes into a `JsString`.
+    ///
+    /// A leading `FF FE` byte order mark is removed when `strip_bom` is true.
+    /// Unpaired surrogates and a truncated trailing byte are turned into
+    /// U+FFFD by `decode_utf16_units` in the parent module.
     pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
         if strip_bom {
             input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input);
         }
 
-        // After this point, input is of even length.
-        let dangling = if input.len().is_multiple_of(2) {
-            false
-        } else {
+        let dangling_byte = !input.len().is_multiple_of(2);
+        if dangling_byte {
             input = &input[0..input.len() - 1];
-            true
-        };
-
-        let input: &[u16] = bytemuck::cast_slice(input);
-
-        if dangling {
-            JsString::from(&[JsString::from(input), js_string!("\u{FFFD}")])
-        } else {
-            JsString::from(input)
         }
+
+        // Assemble each code unit from its two bytes instead of reinterpreting
+        // the buffer as `&[u16]`: `bytemuck::cast_slice` panics when the byte
+        // slice is not 2-byte aligned and reads units in host byte order.
+        let code_units = input
+            .chunks_exact(2)
+            .map(|pair| u16::from_le_bytes([pair[0], pair[1]]));
+        super::decode_utf16_units(code_units, dangling_byte)
     }
 }
 
 pub(crate) mod utf16be {
-    use boa_engine::{JsString, js_string};
+    use boa_engine::JsString;
 
-    pub(crate) fn decode(mut input: Vec<u8>, strip_bom: bool) -> JsString {
-        if strip_bom && input.starts_with(&[0xFE, 0xFF]) {
-            input.drain(..2);
+    pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
+        if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) {
+            input = rest; // continue after the FE FF byte order mark
         }
 
-        let mut input = input.as_mut_slice();
-        // After this point, input is of even length.
-        let dangling = if input.len().is_multiple_of(2) {
-            false
-        } else {
-            let new_len = input.len() - 1;
-            input = &mut input[0..new_len];
-            true
-        };
-
-        let input: &mut [u16] = bytemuck::cast_slice_mut(input);
-
-        // Swap the bytes.
-        for b in &mut *input {
-            *b = b.swap_bytes();
+        let dangling_byte = !input.len().is_multiple_of(2); // odd length: stray final byte
+        if dangling_byte {
+            input = &input[0..input.len() - 1]; // keep only whole 2-byte units
         }
 
-        if dangling {
-            JsString::from(&[JsString::from(&*input), js_string!("\u{FFFD}")])
-        } else {
-            JsString::from(&*input)
-        }
+        let code_units = input
+            .chunks_exact(2)
+            .map(|pair| u16::from_be_bytes([pair[0], pair[1]])); // high-order byte first
+        super::decode_utf16_units(code_units, dangling_byte) // inserts U+FFFD as needed
     }
 }
diff --git a/core/runtime/src/text/mod.rs b/core/runtime/src/text/mod.rs
@@ -200,10 +200,7 @@ impl TextDecoder {
         Ok(match self.encoding {
             Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),
             Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),
-            Encoding::Utf16Be => {
-                let owned = data.to_vec();
-                encodings::utf16be::decode(owned, strip_bom)
-            }
+            Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom),
         })
     }
 }
diff --git a/core/runtime/src/text/tests.rs b/core/runtime/src/text/tests.rs
@@ -1,3 +1,4 @@
+use super::encodings;
 use crate::test::{TestAction, run_test_actions_with};
 use crate::text;
 use boa_engine::object::builtins::JsUint8Array;
@@ -476,3 +477,68 @@ fn decoder_handle_data_view_offset_and_length() {
         context,
     );
 }
+
+// Cases from issue #4612 as (input, expected) code units: unpaired surrogates become U+FFFD.
+const INVALID_UTF16_CASES: &[(&[u16], &[u16])] = &[
+    // Lone high surrogate in the middle
+    (
+        &[0x0061, 0x0062, 0xD800, 0x0077, 0x0078],
+        &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
+    ),
+    // Lone high surrogate only
+    (&[0xD800], &[0xFFFD]),
+    // Two consecutive high surrogates
+    (&[0xD800, 0xD800], &[0xFFFD, 0xFFFD]),
+    // Lone low surrogate in the middle
+    (
+        &[0x0061, 0x0062, 0xDFFF, 0x0077, 0x0078],
+        &[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
+    ),
+    // Low surrogate followed by high surrogate (wrong order)
+    (&[0xDFFF, 0xD800], &[0xFFFD, 0xFFFD]),
+];
+
+#[test]
+fn decoder_utf16le_replaces_unpaired_surrogates() {
+    // Every table entry is serialized as little-endian bytes and decoded again.
+    for &(invalid, replaced) in INVALID_UTF16_CASES {
+        let input_bytes: Vec<u8> = invalid
+            .iter()
+            .flat_map(|code_unit| code_unit.to_le_bytes())
+            .collect();
+
+        let expected = JsString::from(replaced);
+        let result = encodings::utf16le::decode(&input_bytes, false);
+        assert_eq!(result, expected, "utf16le failed for input {invalid:?}");
+    }
+}
+
+#[test]
+fn decoder_utf16be_replaces_unpaired_surrogates() {
+    // Every table entry is serialized as big-endian bytes and decoded again.
+    for &(invalid, replaced) in INVALID_UTF16_CASES {
+        let input_bytes: Vec<u8> = invalid
+            .iter()
+            .flat_map(|code_unit| code_unit.to_be_bytes())
+            .collect();
+
+        let expected = JsString::from(replaced);
+        let result = encodings::utf16be::decode(&input_bytes, false);
+        assert_eq!(result, expected, "utf16be failed for input {invalid:?}");
+    }
+}
+
+#[test]
+fn decoder_utf16le_dangling_byte_produces_replacement() {
+    // Odd-length input: the incomplete final code unit becomes one U+FFFD.
+    let input: &[u8] = &[0x41, 0x00, 0x42]; // 'A' (LE) + dangling byte
+    let result = encodings::utf16le::decode(input, false);
+    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
+    assert_eq!(result, expected);
+
+    // A trailing high surrogate followed by a dangling byte is a single
+    // decoding error, so exactly one U+FFFD must be emitted, not two.
+    let input: &[u8] = &[0x41, 0x00, 0x00, 0xD8, 0x42]; // 'A', U+D800 (LE), dangling byte
+    let result = encodings::utf16le::decode(input, false);
+    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
+    assert_eq!(result, expected);
+}
+
+#[test]
+fn decoder_utf16be_dangling_byte_produces_replacement() {
+    // Odd-length input: the incomplete final code unit becomes one U+FFFD.
+    let input: &[u8] = &[0x00, 0x41, 0x42]; // 'A' (BE) + dangling byte
+    let result = encodings::utf16be::decode(input, false);
+    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
+    assert_eq!(result, expected);
+
+    // A trailing high surrogate followed by a dangling byte is a single
+    // decoding error, so exactly one U+FFFD must be emitted, not two.
+    let input: &[u8] = &[0x00, 0x41, 0xD8, 0x00, 0x42]; // 'A', U+D800 (BE), dangling byte
+    let result = encodings::utf16be::decode(input, false);
+    let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
+    assert_eq!(result, expected);
+}

Original file line number	Diff line number	Diff line change
`@@ -200,10 +200,7 @@ impl TextDecoder {`
`200`	`200`	`Ok(match self.encoding {`
`201`	`201`	`Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),`
`202`	`202`	`Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),`
`203`		`- Encoding::Utf16Be => {`
`204`		`- let owned = data.to_vec();`
`205`		`- encodings::utf16be::decode(owned, strip_bom)`
`206`		`- }`
	`203`	`+ Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom),`
`207`	`204`	`})`
`208`	`205`	`}`
`209`	`206`	`}`