Skip to content

Commit 6a48a28

Browse files
committed
Allow attributes in closing tags (for compatibility with the Adobe Flash parser)
1 parent 45e8be4 commit 6a48a28

File tree

5 files changed

+117
-218
lines changed

5 files changed

+117
-218
lines changed

Changelog.md

+2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
### Misc Changes
2525

2626
- [#780]: `reader::Parser`, `reader::ElementParser` and `reader::PiParser` moved to the new module `parser`.
27+
- [#776]: Allow attributes in the end tag for compatibility with the Adobe Flash XML parser.
2728

29+
[#776]: https://github.com/tafia/quick-xml/issues/776
2830
[#780]: https://github.com/tafia/quick-xml/pull/780
2931
[#781]: https://github.com/tafia/quick-xml/pull/781
3032

src/reader/buffered_reader.rs

-48
Original file line numberDiff line numberDiff line change
@@ -101,54 +101,6 @@ macro_rules! impl_buffered_source {
101101
ReadTextResult::UpToEof(&buf[start..])
102102
}
103103

104-
#[inline]
105-
$($async)? fn read_bytes_until $(<$lf>)? (
106-
&mut self,
107-
byte: u8,
108-
buf: &'b mut Vec<u8>,
109-
position: &mut u64,
110-
) -> io::Result<(&'b [u8], bool)> {
111-
// search byte must be within the ascii range
112-
debug_assert!(byte.is_ascii());
113-
114-
let mut read = 0;
115-
let start = buf.len();
116-
loop {
117-
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
118-
Ok(n) if n.is_empty() => break,
119-
Ok(n) => n,
120-
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
121-
Err(e) => {
122-
*position += read;
123-
return Err(e);
124-
}
125-
};
126-
127-
match memchr::memchr(byte, available) {
128-
Some(i) => {
129-
buf.extend_from_slice(&available[..i]);
130-
131-
let used = i + 1;
132-
self $(.$reader)? .consume(used);
133-
read += used as u64;
134-
135-
*position += read;
136-
return Ok((&buf[start..], true));
137-
}
138-
None => {
139-
buf.extend_from_slice(available);
140-
141-
let used = available.len();
142-
self $(.$reader)? .consume(used);
143-
read += used as u64;
144-
}
145-
}
146-
}
147-
148-
*position += read;
149-
Ok((&buf[start..], false))
150-
}
151-
152104
#[inline]
153105
$($async)? fn read_with<$($lf,)? P: Parser>(
154106
&mut self,

src/reader/mod.rs

+88-147
Original file line numberDiff line numberDiff line change
@@ -345,18 +345,26 @@ macro_rules! read_until_close {
345345
}
346346
},
347347
// `</` - closing tag
348+
// #776: We parse using ElementParser which allows us to have attributes
349+
// in close tags. While such tags are not allowed by the specification,
350+
// we allow parsing them anyway because:
351+
// - we do not check constraints during parsing. This is performed by the
352+
// optional validate step which the user should call manually
353+
// - if we just look for `>` we will parse `</tag attr=">" >` as end tag
354+
// `</tag attr=">` and text `" >` which probably no existing parser
355+
// does. This is malformed XML, however it is tolerated by some parsers
356+
// (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
348357
Ok(Some(b'/')) => match $reader
349-
.read_bytes_until(b'>', $buf, &mut $self.state.offset)
358+
.read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
350359
$(.$await)?
351360
{
352-
Ok((bytes, true)) => $self.state.emit_end(bytes),
353-
Ok((_, false)) => {
361+
Ok(bytes) => $self.state.emit_end(bytes),
362+
Err(e) => {
354363
// We want to report error at `<`, but offset was increased,
355364
// so return it back (-1 for `<`)
356365
$self.state.last_error_offset = start - 1;
357-
Err(Error::Syntax(SyntaxError::UnclosedTag))
366+
Err(e)
358367
}
359-
Err(e) => Err(Error::Io(e.into())),
360368
},
361369
// `<?` - processing instruction
362370
Ok(Some(b'?')) => match $reader
@@ -824,39 +832,6 @@ trait XmlSource<'r, B> {
824832
/// [events]: crate::events::Event
825833
fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;
826834

827-
/// Read input until `byte` is found or end of input is reached.
828-
///
829-
/// Returns a slice of data read up to `byte` (exclusive),
830-
/// and a flag noting whether `byte` was found in the input or not.
831-
///
832-
/// # Example
833-
///
834-
/// ```ignore
835-
/// let mut position = 0;
836-
/// let mut input = b"abc*def".as_ref();
837-
/// // ^= 4
838-
///
839-
/// assert_eq!(
840-
/// input.read_bytes_until(b'*', (), &mut position).unwrap(),
841-
/// (b"abc".as_ref(), true)
842-
/// );
843-
/// assert_eq!(position, 4); // position after the symbol matched
844-
/// ```
845-
///
846-
/// # Parameters
847-
/// - `byte`: Byte for search
848-
/// - `buf`: Buffer that could be filled from an input (`Self`) and
849-
/// from which [events] could borrow their data
850-
/// - `position`: Will be increased by amount of bytes consumed
851-
///
852-
/// [events]: crate::events::Event
853-
fn read_bytes_until(
854-
&mut self,
855-
byte: u8,
856-
buf: B,
857-
position: &mut u64,
858-
) -> io::Result<(&'r [u8], bool)>;
859-
860835
/// Read input until processing instruction is finished.
861836
///
862837
/// This method expect that start sequence of a parser already was read.
@@ -1022,115 +997,6 @@ mod test {
1022997
$buf:expr
1023998
$(, $async:ident, $await:ident)?
1024999
) => {
1025-
mod read_bytes_until {
1026-
use super::*;
1027-
// Use Bytes for printing bytes as strings for ASCII range
1028-
use crate::utils::Bytes;
1029-
use pretty_assertions::assert_eq;
1030-
1031-
/// Checks that search in the empty buffer returns `None`
1032-
#[$test]
1033-
$($async)? fn empty() {
1034-
let buf = $buf;
1035-
let mut position = 0;
1036-
let mut input = b"".as_ref();
1037-
// ^= 0
1038-
1039-
let (bytes, found) = $source(&mut input)
1040-
.read_bytes_until(b'*', buf, &mut position)
1041-
$(.$await)?
1042-
.unwrap();
1043-
assert_eq!(
1044-
(Bytes(bytes), found),
1045-
(Bytes(b""), false)
1046-
);
1047-
assert_eq!(position, 0);
1048-
}
1049-
1050-
/// Checks that search in the buffer non-existent value returns entire buffer
1051-
/// as a result and set `position` to `len()`
1052-
#[$test]
1053-
$($async)? fn non_existent() {
1054-
let buf = $buf;
1055-
let mut position = 0;
1056-
let mut input = b"abcdef".as_ref();
1057-
// ^= 6
1058-
1059-
let (bytes, found) = $source(&mut input)
1060-
.read_bytes_until(b'*', buf, &mut position)
1061-
$(.$await)?
1062-
.unwrap();
1063-
assert_eq!(
1064-
(Bytes(bytes), found),
1065-
(Bytes(b"abcdef"), false)
1066-
);
1067-
assert_eq!(position, 6);
1068-
}
1069-
1070-
/// Checks that search in the buffer an element that is located in the front of
1071-
/// buffer returns empty slice as a result and set `position` to one symbol
1072-
/// after match (`1`)
1073-
#[$test]
1074-
$($async)? fn at_the_start() {
1075-
let buf = $buf;
1076-
let mut position = 0;
1077-
let mut input = b"*abcdef".as_ref();
1078-
// ^= 1
1079-
1080-
let (bytes, found) = $source(&mut input)
1081-
.read_bytes_until(b'*', buf, &mut position)
1082-
$(.$await)?
1083-
.unwrap();
1084-
assert_eq!(
1085-
(Bytes(bytes), found),
1086-
(Bytes(b""), true)
1087-
);
1088-
assert_eq!(position, 1); // position after the symbol matched
1089-
}
1090-
1091-
/// Checks that search in the buffer an element that is located in the middle of
1092-
/// buffer returns slice before that symbol as a result and set `position` to one
1093-
/// symbol after match
1094-
#[$test]
1095-
$($async)? fn inside() {
1096-
let buf = $buf;
1097-
let mut position = 0;
1098-
let mut input = b"abc*def".as_ref();
1099-
// ^= 4
1100-
1101-
let (bytes, found) = $source(&mut input)
1102-
.read_bytes_until(b'*', buf, &mut position)
1103-
$(.$await)?
1104-
.unwrap();
1105-
assert_eq!(
1106-
(Bytes(bytes), found),
1107-
(Bytes(b"abc"), true)
1108-
);
1109-
assert_eq!(position, 4); // position after the symbol matched
1110-
}
1111-
1112-
/// Checks that search in the buffer an element that is located in the end of
1113-
/// buffer returns slice before that symbol as a result and set `position` to one
1114-
/// symbol after match (`len()`)
1115-
#[$test]
1116-
$($async)? fn in_the_end() {
1117-
let buf = $buf;
1118-
let mut position = 0;
1119-
let mut input = b"abcdef*".as_ref();
1120-
// ^= 7
1121-
1122-
let (bytes, found) = $source(&mut input)
1123-
.read_bytes_until(b'*', buf, &mut position)
1124-
$(.$await)?
1125-
.unwrap();
1126-
assert_eq!(
1127-
(Bytes(bytes), found),
1128-
(Bytes(b"abcdef"), true)
1129-
);
1130-
assert_eq!(position, 7); // position after the symbol matched
1131-
}
1132-
}
1133-
11341000
mod read_bang_element {
11351001
use super::*;
11361002
use crate::errors::{Error, SyntaxError};
@@ -1693,6 +1559,81 @@ mod test {
16931559
assert_eq!(position, 42);
16941560
}
16951561
}
1562+
1563+
mod close {
1564+
use super::*;
1565+
use pretty_assertions::assert_eq;
1566+
1567+
#[$test]
1568+
$($async)? fn empty_tag() {
1569+
let buf = $buf;
1570+
let mut position = 1;
1571+
let mut input = b"/ >".as_ref();
1572+
// ^= 4
1573+
1574+
assert_eq!(
1575+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1576+
Bytes(b"/ ")
1577+
);
1578+
assert_eq!(position, 4);
1579+
}
1580+
1581+
#[$test]
1582+
$($async)? fn normal() {
1583+
let buf = $buf;
1584+
let mut position = 1;
1585+
let mut input = b"/tag>".as_ref();
1586+
// ^= 6
1587+
1588+
assert_eq!(
1589+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1590+
Bytes(b"/tag")
1591+
);
1592+
assert_eq!(position, 6);
1593+
}
1594+
1595+
#[$test]
1596+
$($async)? fn empty_ns_empty_tag() {
1597+
let buf = $buf;
1598+
let mut position = 1;
1599+
let mut input = b"/:>".as_ref();
1600+
// ^= 4
1601+
1602+
assert_eq!(
1603+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1604+
Bytes(b"/:")
1605+
);
1606+
assert_eq!(position, 4);
1607+
}
1608+
1609+
#[$test]
1610+
$($async)? fn empty_ns() {
1611+
let buf = $buf;
1612+
let mut position = 1;
1613+
let mut input = b"/:tag>".as_ref();
1614+
// ^= 7
1615+
1616+
assert_eq!(
1617+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1618+
Bytes(b"/:tag")
1619+
);
1620+
assert_eq!(position, 7);
1621+
}
1622+
1623+
#[$test]
1624+
$($async)? fn with_attributes() {
1625+
let buf = $buf;
1626+
let mut position = 1;
1627+
let mut input = br#"/tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1628+
// ^= 40
1629+
1630+
assert_eq!(
1631+
Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1632+
Bytes(br#"/tag attr-1=">" attr2 = '>' 3attr"#)
1633+
);
1634+
assert_eq!(position, 40);
1635+
}
1636+
}
16961637
}
16971638

16981639
/// Ensures, that no empty `Text` events are generated

src/reader/slice_reader.rs

-23
Original file line numberDiff line numberDiff line change
@@ -284,29 +284,6 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
284284
}
285285
}
286286

287-
#[inline]
288-
fn read_bytes_until(
289-
&mut self,
290-
byte: u8,
291-
_buf: (),
292-
position: &mut u64,
293-
) -> io::Result<(&'a [u8], bool)> {
294-
// search byte must be within the ascii range
295-
debug_assert!(byte.is_ascii());
296-
297-
if let Some(i) = memchr::memchr(byte, self) {
298-
*position += i as u64 + 1;
299-
let bytes = &self[..i];
300-
*self = &self[i + 1..];
301-
Ok((bytes, true))
302-
} else {
303-
*position += self.len() as u64;
304-
let bytes = &self[..];
305-
*self = &[];
306-
Ok((bytes, false))
307-
}
308-
}
309-
310287
#[inline]
311288
fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
312289
where

tests/issues.rs

+27
Original file line numberDiff line numberDiff line change
@@ -364,3 +364,30 @@ fn issue774() {
364364
Event::End(BytesEnd::new("tag"))
365365
);
366366
}
367+
368+
/// Regression test for https://github.com/tafia/quick-xml/issues/776
369+
#[test]
370+
fn issue776() {
371+
let mut reader = Reader::from_str(r#"<tag></tag/><tag></tag attr=">">"#);
372+
// We still think that the name of the end tag is everything between `</` and `>`
373+
// and if we do not disable this check we get an error
374+
reader.config_mut().check_end_names = false;
375+
376+
assert_eq!(
377+
reader.read_event().unwrap(),
378+
Event::Start(BytesStart::new("tag"))
379+
);
380+
assert_eq!(
381+
reader.read_event().unwrap(),
382+
Event::End(BytesEnd::new("tag/"))
383+
);
384+
385+
assert_eq!(
386+
reader.read_event().unwrap(),
387+
Event::Start(BytesStart::new("tag"))
388+
);
389+
assert_eq!(
390+
reader.read_event().unwrap(),
391+
Event::End(BytesEnd::new(r#"tag attr=">""#))
392+
);
393+
}

0 commit comments

Comments
 (0)