Skip to content

Commit 5c99a18

Browse files
core: Refactor XML unescape and error handling
1 parent a959bea commit 5c99a18

File tree

5 files changed

+183
-99
lines changed

5 files changed

+183
-99
lines changed

core/common/src/xml.rs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1+
use quick_xml::escape::EscapeError as XmlEscapeError;
12
use regress::Regex;
23
use std::{ops::Deref, sync::LazyLock};
34

4-
static ENTITY_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"&[^;]*;").unwrap());
5+
// Matches XML entities like &amp; or &#39; or &#x1F600;
6+
// We exclude both ';' and '&' from the middle to avoid matching across multiple
7+
// ampersands - e.g. in "& &thing;" we want to match only "&thing;", not "& &thing;"
8+
static ENTITY_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"&[^;&]*;").unwrap());
59

610
/// Handles flash-specific XML unescaping behavior.
711
/// We accept all XML entities, and also accept standalone '&' without
812
/// a corresponding ';'
9-
pub fn custom_unescape(
10-
data: &[u8],
11-
decoder: quick_xml::Decoder,
12-
) -> Result<String, quick_xml::Error> {
13-
let input = decoder.decode(data)?;
13+
pub fn custom_unescape(input: &[u8]) -> Result<String, std::str::Utf8Error> {
14+
let input = std::str::from_utf8(input)?;
1415

1516
let re = ENTITY_REGEX.deref();
1617
let mut result = String::new();
@@ -19,23 +20,30 @@ pub fn custom_unescape(
1920
// Find all entities, and try to unescape them.
2021
// Our regular expression will skip over '&' without a matching ';',
2122
// which will preserve them as-is in the output
22-
for cap in re.find_iter(&input) {
23+
for cap in re.find_iter(input) {
2324
let start = cap.start();
2425
let end = cap.end();
26+
2527
result.push_str(&input[last_end..start]);
2628

2729
let entity = &input[start..end];
2830
// Unfortunately, we need to call this on each entity individually,
2931
// since it bails out if *any* entities in the string lack a terminating ';'
3032
match quick_xml::escape::unescape(entity) {
3133
Ok(decoded) => result.push_str(&decoded),
32-
// FIXME - check the actual error once https://github.com/tafia/quick-xml/pull/584 is merged
33-
Err(_) => result.push_str(entity),
34+
Err(err) => match err {
35+
XmlEscapeError::InvalidCharRef(_) | XmlEscapeError::UnrecognizedEntity(_, _) => {
36+
result.push_str(entity)
37+
}
38+
// The regex guarantees a semicolon
39+
XmlEscapeError::UnterminatedEntity(_) => unreachable!(),
40+
},
3441
}
3542

3643
last_end = end;
3744
}
3845

3946
result.push_str(&input[last_end..]);
47+
4048
Ok(result)
4149
}

core/src/avm1/globals/xml.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use crate::string::{AvmString, StringContext, WStr, WString};
1212
use gc_arena::barrier::unlock;
1313
use gc_arena::lock::Lock;
1414
use gc_arena::{Collect, Gc};
15+
use quick_xml::encoding::EncodingError;
1516
use quick_xml::errors::IllFormedError;
1617
use quick_xml::events::attributes::AttrError;
1718
use quick_xml::{Reader, events::Event};
@@ -174,17 +175,15 @@ impl<'gc> Xml<'gc> {
174175

175176
match event {
176177
Event::Start(bs) => {
177-
let child =
178-
XmlNode::from_start_event(activation, bs, self.id_map(), parser.decoder())?;
178+
let child = XmlNode::from_start_event(activation, bs, self.id_map())?;
179179
open_tags
180180
.last()
181181
.unwrap()
182182
.append_child(activation.gc(), child);
183183
open_tags.push(child);
184184
}
185185
Event::Empty(bs) => {
186-
let child =
187-
XmlNode::from_start_event(activation, bs, self.id_map(), parser.decoder())?;
186+
let child = XmlNode::from_start_event(activation, bs, self.id_map())?;
188187
open_tags
189188
.last()
190189
.unwrap()
@@ -195,7 +194,9 @@ impl<'gc> Xml<'gc> {
195194
}
196195
Event::Text(bt) => {
197196
Self::handle_text_cdata(
198-
custom_unescape(&bt.into_inner(), parser.decoder())?.as_bytes(),
197+
custom_unescape(&bt)
198+
.map_err(|e| quick_xml::Error::Encoding(EncodingError::Utf8(e)))?
199+
.as_bytes(),
199200
ignore_white,
200201
&open_tags,
201202
activation,

core/src/avm1/xml/tree.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use gc_arena::{
88
Collect, Gc, Mutation,
99
lock::{Lock, RefLock},
1010
};
11+
use quick_xml::encoding::EncodingError;
1112
use quick_xml::escape::escape;
1213
use quick_xml::events::BytesStart;
1314
use ruffle_common::xml::custom_unescape;
@@ -83,18 +84,19 @@ impl<'gc> XmlNode<'gc> {
8384
activation: &mut Activation<'_, 'gc>,
8485
bs: BytesStart<'_>,
8586
id_map: Object<'gc>,
86-
decoder: quick_xml::Decoder,
8787
) -> Result<Self, quick_xml::Error> {
8888
let name = AvmString::new_utf8_bytes(activation.gc(), bs.name().into_inner());
8989
let node = Self::new(activation.gc(), ELEMENT_NODE, Some(name));
9090

9191
// Reverse attributes so they appear in the `PropertyMap` in their definition order.
92-
let attributes: Result<Vec<_>, _> = bs.attributes().collect();
93-
let attributes = attributes?;
92+
let attributes = bs.attributes().collect::<Result<Vec<_>, _>>()?;
93+
9494
for attribute in attributes.iter().rev() {
9595
let key = AvmString::new_utf8_bytes(activation.gc(), attribute.key.into_inner());
96-
let value_str = custom_unescape(&attribute.value, decoder)?;
97-
let value = AvmString::new_utf8_bytes(activation.gc(), value_str.as_bytes());
96+
97+
let value_str = custom_unescape(&attribute.value)
98+
.map_err(|e| quick_xml::Error::Encoding(EncodingError::Utf8(e)))?;
99+
let value = AvmString::new_utf8(activation.gc(), value_str);
98100

99101
// Insert an attribute.
100102
node.attributes()

core/src/avm2/e4x.rs

Lines changed: 84 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use crate::avm2::error::{
2-
make_error_1010, make_error_1085, make_error_1088, make_error_1118, make_unknown_ns_error,
3-
make_xml_error,
2+
XmlErrorCode, make_error_1010, make_error_1085, make_error_1088, make_error_1118,
3+
make_unknown_ns_error, make_xml_error,
44
};
55
use crate::avm2::function::FunctionArgs;
66
use crate::avm2::object::{E4XOrXml, FunctionObject, NamespaceObject};
@@ -15,8 +15,8 @@ use gc_arena::{
1515

1616
use quick_xml::{
1717
Error as XmlError, NsReader,
18-
errors::IllFormedError,
19-
events::{BytesStart, Event},
18+
errors::{IllFormedError, SyntaxError as XmlSyntaxError},
19+
events::{BytesStart, Event, attributes::AttrError as XmlAttrError},
2020
name::ResolveResult,
2121
};
2222
use ruffle_common::xml::custom_unescape;
@@ -765,41 +765,71 @@ impl<'gc> E4XNode<'gc> {
765765
loop {
766766
let event = match parser.read_event() {
767767
Ok(event) => event,
768-
Err(XmlError::IllFormed(IllFormedError::MismatchedEndTag { expected, found })) => {
769-
// We must accept </a/>, </a />, and </a b="c">
770-
// TODO: Reject </a bc>, </a//>, <a //> etc.
771-
if let Some(rest) = found.strip_prefix(&expected) {
772-
if rest.starts_with([' ', '\t', '/']) {
773-
let node = open_tags.pop().unwrap();
774-
if open_tags.is_empty() {
775-
top_level.push(node);
768+
Err(err) => {
769+
let err = match err {
770+
XmlError::IllFormed(IllFormedError::MismatchedEndTag {
771+
expected,
772+
found,
773+
}) => {
774+
// We must accept </a/>, </a />, and </a b="c">
775+
// TODO: Reject </a bc>, </a//>, <a //> etc.
776+
if let Some(rest) = found.strip_prefix(&expected) {
777+
if rest.starts_with([' ', '\t', '/']) {
778+
let node = open_tags.pop().unwrap();
779+
if open_tags.is_empty() {
780+
top_level.push(node);
781+
}
782+
continue;
783+
}
776784
}
777-
continue;
785+
make_error_1085(activation, &expected)
778786
}
779-
}
780-
return Err(make_error_1085(activation, &expected));
781-
}
782-
Err(XmlError::IllFormed(IllFormedError::UnmatchedEndTag(_)))
783-
if open_tags.is_empty() =>
784-
{
785-
return Err(make_error_1088(activation));
787+
XmlError::IllFormed(IllFormedError::UnmatchedEndTag(_))
788+
if open_tags.is_empty() =>
789+
{
790+
make_error_1088(activation)
791+
}
792+
XmlError::InvalidAttr(XmlAttrError::Duplicated(_, _)) => {
793+
make_xml_error(activation, XmlErrorCode::DuplicateAttribute)
794+
}
795+
XmlError::Syntax(syntax_error) => {
796+
let code = match syntax_error {
797+
XmlSyntaxError::UnclosedPIOrXmlDecl => {
798+
XmlErrorCode::UnterminatedProcessingInstruction
799+
}
800+
XmlSyntaxError::UnclosedComment => {
801+
XmlErrorCode::UnterminatedComment
802+
}
803+
XmlSyntaxError::UnclosedDoctype => {
804+
XmlErrorCode::UnterminatedDoctype
805+
}
806+
XmlSyntaxError::UnclosedCData => XmlErrorCode::UnterminatedCData,
807+
XmlSyntaxError::UnclosedTag => XmlErrorCode::ElementMalformed,
808+
// TODO: handle other errors properly
809+
_ => XmlErrorCode::ElementMalformed,
810+
};
811+
812+
make_xml_error(activation, code)
813+
}
814+
// TODO: handle other errors properly
815+
_ => make_xml_error(activation, XmlErrorCode::ElementMalformed),
816+
};
817+
818+
return Err(err);
786819
}
787-
Err(err) => return Err(make_xml_error(activation, err)),
788820
};
789821

790822
match &event {
791823
Event::Start(bs) => {
792-
let child =
793-
E4XNode::from_start_event(activation, &parser, bs, parser.decoder())?;
824+
let child = E4XNode::from_start_event(activation, &parser, bs)?;
794825

795826
if let Some(current_tag) = open_tags.last_mut() {
796827
current_tag.append_child(activation.gc(), child);
797828
}
798829
open_tags.push(child);
799830
}
800831
Event::Empty(bs) => {
801-
let node =
802-
E4XNode::from_start_event(activation, &parser, bs, parser.decoder())?;
832+
let node = E4XNode::from_start_event(activation, &parser, bs)?;
803833
push_childless_node(node, &mut open_tags, &mut top_level, activation);
804834
}
805835
Event::End(_) => {
@@ -810,8 +840,10 @@ impl<'gc> E4XNode<'gc> {
810840
}
811841
Event::Text(bt) => {
812842
handle_text_cdata(
813-
custom_unescape(bt, parser.decoder())
814-
.map_err(|e| make_xml_error(activation, e))?
843+
custom_unescape(bt)
844+
.map_err(|_| {
845+
make_xml_error(activation, XmlErrorCode::ElementMalformed)
846+
})?
815847
.as_bytes(),
816848
ignore_white,
817849
&mut open_tags,
@@ -836,9 +868,11 @@ impl<'gc> E4XNode<'gc> {
836868
if ignore_comments {
837869
continue;
838870
}
839-
let text = custom_unescape(bt, parser.decoder())
840-
.map_err(|e| make_xml_error(activation, e))?;
841-
let text = AvmString::new_utf8_bytes(activation.gc(), text.as_bytes());
871+
872+
let text = custom_unescape(bt)
873+
.map_err(|_| make_xml_error(activation, XmlErrorCode::ElementMalformed))?;
874+
let text = AvmString::new_utf8(activation.gc(), text);
875+
842876
let node = E4XNode(Gc::new(
843877
activation.gc(),
844878
E4XNodeData {
@@ -856,8 +890,10 @@ impl<'gc> E4XNode<'gc> {
856890
if ignore_processing_instructions {
857891
continue;
858892
}
859-
let text = custom_unescape(bt, parser.decoder())
860-
.map_err(|e| make_xml_error(activation, e))?;
893+
894+
let text = custom_unescape(bt)
895+
.map_err(|_| make_xml_error(activation, XmlErrorCode::ElementMalformed))?;
896+
861897
let (name, value) = if let Some((name, value)) = text.split_once(' ') {
862898
(
863899
AvmString::new_utf8_bytes(activation.gc(), name.as_bytes()),
@@ -910,18 +946,26 @@ impl<'gc> E4XNode<'gc> {
910946
activation: &mut Activation<'_, 'gc>,
911947
parser: &NsReader<&[u8]>,
912948
bs: &BytesStart<'_>,
913-
decoder: quick_xml::Decoder,
914949
) -> Result<Self, Error<'gc>> {
915950
let mut attribute_nodes = Vec::new();
916951
let mut namespaces = Vec::new();
917952

918-
let attributes: Result<Vec<_>, _> = bs.attributes().collect();
919-
for attribute in
920-
attributes.map_err(|e| make_xml_error(activation, XmlError::InvalidAttr(e)))?
921-
{
922-
let value_str = custom_unescape(&attribute.value, decoder)
923-
.map_err(|e| make_xml_error(activation, e))?;
924-
let value = AvmString::new_utf8_bytes(activation.gc(), value_str.as_bytes());
953+
let attributes = bs
954+
.attributes()
955+
.collect::<Result<Vec<_>, XmlAttrError>>()
956+
.map_err(|e| {
957+
let code = match e {
958+
XmlAttrError::Duplicated(_, _) => XmlErrorCode::DuplicateAttribute,
959+
_ => XmlErrorCode::ElementMalformed,
960+
};
961+
962+
make_xml_error(activation, code)
963+
})?;
964+
965+
for attribute in attributes {
966+
let value_str = custom_unescape(&attribute.value)
967+
.map_err(|_| make_xml_error(activation, XmlErrorCode::ElementMalformed))?;
968+
let value = AvmString::new_utf8(activation.gc(), value_str);
925969

926970
let (ns, local_name) = parser.resolve_attribute(attribute.key);
927971

0 commit comments

Comments
 (0)