Skip to content

Commit 37b4db6

Browse files
committed
Validate JATS input before parsing in Crossref path; fix double-escaped attributes in write_jats_content
1 parent d64df43 commit 37b4db6

2 files changed

Lines changed: 106 additions & 25 deletions

File tree

thoth-api/src/markup/mod.rs

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,7 @@ pub fn convert_to_jats(
392392
/// normalise stored abstract-like markup into the subset we safely emit to Crossref.
393393
pub fn normalise_crossref_abstract_jats(content: &str) -> ThothResult<String> {
394394
let ast = if looks_like_markup(content) {
395+
validate_jats_subset(content, ConversionLimit::Abstract)?;
395396
jats_to_ast(content)
396397
} else {
397398
plain_text_to_ast(content)
@@ -792,10 +793,16 @@ mod tests {
792793
}
793794

794795
#[test]
795-
fn test_normalise_crossref_abstract_jats_splits_breaks_and_removes_empty_paragraphs() {
796-
let input = "<p></p><p>First line<break/>Second line</p>";
796+
fn test_normalise_crossref_abstract_jats_removes_empty_paragraphs() {
797+
let input = "<p></p><p>First line</p>";
797798
let output = normalise_crossref_abstract_jats(input).unwrap();
798-
assert_eq!(output, "<p>First line</p><p>Second line</p>");
799+
assert_eq!(output, "<p>First line</p>");
800+
}
801+
802+
#[test]
803+
fn test_normalise_crossref_abstract_jats_rejects_break_elements() {
804+
let input = "<p>First line<break/>Second line</p>";
805+
assert!(normalise_crossref_abstract_jats(input).is_err());
799806
}
800807

801808
#[test]
@@ -804,6 +811,32 @@ mod tests {
804811
let output = normalise_crossref_abstract_jats(input).unwrap();
805812
assert_eq!(output, "<p>This has <bold>bold</bold> text.</p>");
806813
}
814+
815+
#[test]
816+
fn test_normalise_crossref_abstract_jats_preserves_existing_entities_once() {
817+
let input = "<p>x &amp; y &lt; z &gt; w</p>";
818+
let output = normalise_crossref_abstract_jats(input).unwrap();
819+
assert_eq!(output, "<p>x &amp; y &lt; z &gt; w</p>");
820+
}
821+
822+
#[test]
823+
fn test_normalise_crossref_abstract_jats_rejects_malformed_mixed_markup() {
824+
let input = "<p>Range 1 < 2 and <italic>styled</italic></p>";
825+
assert!(normalise_crossref_abstract_jats(input).is_err());
826+
}
827+
828+
#[test]
829+
fn test_markdown_abstract_escapes_reserved_xml_chars() {
830+
let input = "x < y & z > w";
831+
let output = convert_to_jats(
832+
input.to_string(),
833+
MarkupFormat::Markdown,
834+
ConversionLimit::Abstract,
835+
)
836+
.unwrap();
837+
838+
assert_eq!(output, "<p>x &lt; y &amp; z &gt; w</p>");
839+
}
807840
// --- convert_to_jats tests end ---
808841

809842
// --- convert_from_jats tests start ---

thoth-export-server/src/xml/doideposit_crossref.rs

Lines changed: 70 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -325,16 +325,31 @@ fn write_jats_content<W: Write>(content: &str, w: &mut EventWriter<W>) -> ThothR
325325
let mut event_builder = XmlEvent::start_element(&*name);
326326

327327
// Add attributes
328-
let attrs: Vec<(String, String)> = e
328+
let attrs: ThothResult<Vec<(String, String)>> = e
329329
.attributes()
330-
.flatten()
331330
.map(|attr| {
332-
(
331+
let attr = attr.map_err(|err| {
332+
ThothError::InternalError(format!(
333+
"Error parsing JATS content attributes: {}",
334+
err
335+
))
336+
})?;
337+
let value = attr
338+
.decode_and_unescape_value(reader.decoder())
339+
.map_err(|err| {
340+
ThothError::InternalError(format!(
341+
"Error decoding JATS content attributes: {}",
342+
err
343+
))
344+
})?
345+
.into_owned();
346+
Ok((
333347
String::from_utf8_lossy(attr.key.as_ref()).to_string(),
334-
String::from_utf8_lossy(&attr.value).to_string(),
335-
)
348+
value,
349+
))
336350
})
337351
.collect();
352+
let attrs = attrs?;
338353

339354
for (key, value) in &attrs {
340355
event_builder = event_builder.attr(key.as_str(), value.as_str());
@@ -357,16 +372,31 @@ fn write_jats_content<W: Write>(content: &str, w: &mut EventWriter<W>) -> ThothR
357372
let mut event_builder = XmlEvent::start_element(&*name);
358373

359374
// Add attributes
360-
let attrs: Vec<(String, String)> = e
375+
let attrs: ThothResult<Vec<(String, String)>> = e
361376
.attributes()
362-
.flatten()
363377
.map(|attr| {
364-
(
378+
let attr = attr.map_err(|err| {
379+
ThothError::InternalError(format!(
380+
"Error parsing JATS content attributes: {}",
381+
err
382+
))
383+
})?;
384+
let value = attr
385+
.decode_and_unescape_value(reader.decoder())
386+
.map_err(|err| {
387+
ThothError::InternalError(format!(
388+
"Error decoding JATS content attributes: {}",
389+
err
390+
))
391+
})?
392+
.into_owned();
393+
Ok((
365394
String::from_utf8_lossy(attr.key.as_ref()).to_string(),
366-
String::from_utf8_lossy(&attr.value).to_string(),
367-
)
395+
value,
396+
))
368397
})
369398
.collect();
399+
let attrs = attrs?;
370400

371401
for (key, value) in &attrs {
372402
event_builder = event_builder.attr(key.as_str(), value.as_str());
@@ -2487,7 +2517,7 @@ mod tests {
24872517
// Should not contain any paragraph elements
24882518
assert!(!output.contains(r#"<jats:p>"#));
24892519

2490-
// Nested paragraph wrappers should be flattened before writing.
2520+
// Nested paragraph wrappers are invalid JATS and should be rejected.
24912521
let mut buffer = Vec::new();
24922522
let mut writer = xml::writer::EmitterConfig::new()
24932523
.perform_indent(true)
@@ -2500,13 +2530,9 @@ mod tests {
25002530
&mut writer,
25012531
);
25022532

2503-
assert!(result.is_ok());
2504-
let output = String::from_utf8(buffer).unwrap();
2505-
assert!(output.contains(r#"<jats:p>Nested paragraph.</jats:p>"#));
2506-
assert!(!output.contains(r#"<jats:p><jats:p>"#));
2507-
assert!(!output.contains(r#"<jats:p />"#));
2533+
assert!(result.is_err());
25082534

2509-
// Break elements should be converted into sibling paragraphs.
2535+
// Break elements are invalid JATS and should be rejected.
25102536
let mut buffer = Vec::new();
25112537
let mut writer = xml::writer::EmitterConfig::new()
25122538
.perform_indent(true)
@@ -2519,11 +2545,7 @@ mod tests {
25192545
&mut writer,
25202546
);
25212547

2522-
assert!(result.is_ok());
2523-
let output = String::from_utf8(buffer).unwrap();
2524-
assert!(output.contains(r#"<jats:p>First line</jats:p>"#));
2525-
assert!(output.contains(r#"<jats:p>Second line</jats:p>"#));
2526-
assert!(!output.contains(r#"<jats:break"#));
2548+
assert!(result.is_err());
25272549

25282550
// Locale codes written to xml:lang should use BCP 47 hyphen separators.
25292551
let mut buffer = Vec::new();
@@ -2563,6 +2585,32 @@ mod tests {
25632585
assert!(error.contains("Invalid Crossref abstract markup"));
25642586
}
25652587

2588+
#[test]
2589+
fn test_write_abstract_content_with_locale_code_escapes_link_attributes_once() {
2590+
let mut buffer = Vec::new();
2591+
let mut writer = xml::writer::EmitterConfig::new()
2592+
.perform_indent(true)
2593+
.create_writer(&mut buffer);
2594+
2595+
let result = write_abstract_content_with_locale_code(
2596+
r#"<p><ext-link xlink:href="https://example.org?a=1&amp;b=2&amp;quote=&quot;hi&quot;&amp;apos=&apos;ok&apos;">link</ext-link></p>"#,
2597+
"long",
2598+
"EN",
2599+
&mut writer,
2600+
);
2601+
2602+
assert!(result.is_ok());
2603+
let output = String::from_utf8(buffer).unwrap();
2604+
assert!(
2605+
output.contains(
2606+
r#"<jats:ext-link xlink:href="https://example.org?a=1&amp;b=2&amp;quote=&quot;hi&quot;&amp;apos=&apos;ok&apos;">link</jats:ext-link>"#
2607+
)
2608+
);
2609+
assert!(!output.contains("&amp;amp;"));
2610+
assert!(!output.contains("&amp;quot;"));
2611+
assert!(!output.contains("&amp;apos;"));
2612+
}
2613+
25662614
#[test]
25672615
// Crossref previously limited the number of ISBNs that could be included in a deposit file to 6,
25682616
// but this has now been increased in schema version 5.4.0 to 100 (which will never become relevant).

0 commit comments

Comments
 (0)