Skip to content

Commit dde65d2

Browse files
committed
Extract element utility fns to element_util.rs
Makes the `interface.rs` file easier to navigate. Also extracts `enable_logs` to `logs.rs`. This refactoring is extracted from daisy#374, where these functions are used by both the stateful and the stateless interface; moving them avoids any dependency of `stateless_interface` on `interface`.
1 parent ca4b933 commit dde65d2

File tree

10 files changed

+367
-355
lines changed

10 files changed

+367
-355
lines changed

src/canonicalize.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4392,7 +4392,7 @@ mod canonicalize_tests {
43924392

43934393
#[test]
43944394
fn illegal_mathml_element() {
4395-
use crate::interface::*;
4395+
use crate::element_util::{get_element, trim_element};
43964396
let test_str = "<math><foo><mi>f</mi></foo></math>";
43974397
let package1 = &parser::parse(test_str).expect("Failed to parse test input");
43984398
let mathml = get_element(package1);
@@ -4495,7 +4495,7 @@ mod canonicalize_tests {
44954495

44964496
#[test]
44974497
fn mrow_with_intent_and_single_child() {
4498-
use crate::interface::*;
4498+
use crate::element_util::{get_element, trim_element};
44994499
use sxd_document::parser;
45004500
use crate::canonicalize::canonicalize;
45014501
// this forces initialization
@@ -4519,7 +4519,7 @@ mod canonicalize_tests {
45194519
#[test]
45204520
fn empty_mrow_with_intent() {
45214521
// we don't want to remove the mrow because the intent on the mi would reference itself
4522-
use crate::interface::*;
4522+
use crate::element_util::{get_element, trim_element};
45234523
use sxd_document::parser;
45244524
use crate::canonicalize::canonicalize;
45254525
// this forces initialization

src/chemistry.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1818,7 +1818,7 @@ mod chem_tests {
18181818
fn parse_mathml_string<F>(test: &str, test_mathml: F) -> bool
18191819
where F: Fn(Element) -> bool {
18201820
use sxd_document::parser;
1821-
use crate::interface::{get_element, trim_element};
1821+
use crate::element_util::{get_element, trim_element};
18221822

18231823
let new_package = parser::parse(&test);
18241824
if let Err(e) = new_package {

src/element_util.rs

Lines changed: 326 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,326 @@
1+
use crate::canonicalize::{as_element, name};
2+
use crate::errors::*;
3+
use crate::logs::enable_logs;
4+
use crate::xpath_functions::{is_leaf, IsNode};
5+
use lazy_static::lazy_static;
6+
use regex::Regex;
7+
use sxd_document::{Package};
8+
use sxd_document::dom::*;
9+
10+
pub fn get_element(package: &Package) -> Element {
11+
enable_logs();
12+
let doc = package.as_document();
13+
let mut result = None;
14+
for root_child in doc.root().children() {
15+
if let ChildOfRoot::Element(e) = root_child {
16+
assert!(result.is_none());
17+
result = Some(e);
18+
}
19+
}
20+
return result.unwrap();
21+
}
22+
23+
/// returns Ok() if two Documents are equal or some info where they differ in the Err
24+
// Not really meant to be public -- used by tests in some packages
25+
#[allow(dead_code)]
26+
pub fn is_same_element(e1: Element, e2: Element) -> Result<()> {
27+
enable_logs();
28+
if name(e1) != name(e2) {
29+
bail!("Names not the same: {}, {}", name(e1), name(e2));
30+
}
31+
32+
// assume 'e' doesn't have element children until proven otherwise
33+
// this means we keep Text children until we are proven they aren't needed
34+
if e1.children().len() != e2.children().len() {
35+
bail!(
36+
"Children of {} have {} != {} children",
37+
name(e1),
38+
e1.children().len(),
39+
e2.children().len()
40+
);
41+
}
42+
43+
if let Err(e) = attrs_are_same(e1.attributes(), e2.attributes()) {
44+
bail!("In element {}, {}", name(e1), e);
45+
}
46+
47+
for (i, (c1, c2)) in e1.children().iter().zip(e2.children().iter()).enumerate() {
48+
match c1 {
49+
ChildOfElement::Element(child1) => {
50+
if let ChildOfElement::Element(child2) = c2 {
51+
is_same_element(*child1, *child2)?;
52+
} else {
53+
bail!("{} child #{}, first is element, second is something else", name(e1), i);
54+
}
55+
}
56+
ChildOfElement::Comment(com1) => {
57+
if let ChildOfElement::Comment(com2) = c2 {
58+
if com1.text() != com2.text() {
59+
bail!("{} child #{} -- comment text differs", name(e1), i);
60+
}
61+
} else {
62+
bail!("{} child #{}, first is comment, second is something else", name(e1), i);
63+
}
64+
}
65+
ChildOfElement::ProcessingInstruction(p1) => {
66+
if let ChildOfElement::ProcessingInstruction(p2) = c2 {
67+
if p1.target() != p2.target() || p1.value() != p2.value() {
68+
bail!("{} child #{} -- processing instruction differs", name(e1), i);
69+
}
70+
} else {
71+
bail!(
72+
"{} child #{}, first is processing instruction, second is something else",
73+
name(e1),
74+
i
75+
);
76+
}
77+
}
78+
ChildOfElement::Text(t1) => {
79+
if let ChildOfElement::Text(t2) = c2 {
80+
if t1.text() != t2.text() {
81+
bail!("{} child #{} -- text differs", name(e1), i);
82+
}
83+
} else {
84+
bail!("{} child #{}, first is text, second is something else", name(e1), i);
85+
}
86+
}
87+
}
88+
}
89+
return Ok(());
90+
91+
/// compares attributes -- '==' didn't seems to work
92+
fn attrs_are_same(attrs1: Vec<Attribute>, attrs2: Vec<Attribute>) -> Result<()> {
93+
if attrs1.len() != attrs2.len() {
94+
bail!("Attributes have different length: {:?} != {:?}", attrs1, attrs2);
95+
}
96+
// can't guarantee attrs are in the same order
97+
for attr1 in attrs1 {
98+
if let Some(found_attr2) = attrs2
99+
.iter()
100+
.find(|&attr2| attr1.name().local_part() == attr2.name().local_part())
101+
{
102+
if attr1.value() == found_attr2.value() {
103+
continue;
104+
} else {
105+
bail!(
106+
"Attribute named {} has differing values:\n '{}'\n '{}'",
107+
attr1.name().local_part(),
108+
attr1.value(),
109+
found_attr2.value()
110+
);
111+
}
112+
} else {
113+
bail!(
114+
"Attribute name {} not in [{}]",
115+
print_attr(&attr1),
116+
print_attrs(&attrs2)
117+
);
118+
}
119+
}
120+
return Ok(());
121+
122+
fn print_attr(attr: &Attribute) -> String {
123+
return format!("@{}='{}'", attr.name().local_part(), attr.value());
124+
}
125+
fn print_attrs(attrs: &[Attribute]) -> String {
126+
return attrs.iter().map(print_attr).collect::<Vec<String>>().join(", ");
127+
}
128+
}
129+
}
130+
131+
// used for testing trim
132+
/// returns Ok() if two Documents are equal or some info where they differ in the Err
133+
#[allow(dead_code)]
134+
pub(crate) fn is_same_doc(doc1: &Document, doc2: &Document) -> Result<()> {
135+
// assume 'e' doesn't have element children until proven otherwise
136+
// this means we keep Text children until we are proven they aren't needed
137+
if doc1.root().children().len() != doc2.root().children().len() {
138+
bail!(
139+
"Children of docs have {} != {} children",
140+
doc1.root().children().len(),
141+
doc2.root().children().len()
142+
);
143+
}
144+
145+
for (i, (c1, c2)) in doc1
146+
.root()
147+
.children()
148+
.iter()
149+
.zip(doc2.root().children().iter())
150+
.enumerate()
151+
{
152+
match c1 {
153+
ChildOfRoot::Element(e1) => {
154+
if let ChildOfRoot::Element(e2) = c2 {
155+
is_same_element(*e1, *e2)?;
156+
} else {
157+
bail!("child #{}, first is element, second is something else", i);
158+
}
159+
}
160+
ChildOfRoot::Comment(com1) => {
161+
if let ChildOfRoot::Comment(com2) = c2 {
162+
if com1.text() != com2.text() {
163+
bail!("child #{} -- comment text differs", i);
164+
}
165+
} else {
166+
bail!("child #{}, first is comment, second is something else", i);
167+
}
168+
}
169+
ChildOfRoot::ProcessingInstruction(p1) => {
170+
if let ChildOfRoot::ProcessingInstruction(p2) = c2 {
171+
if p1.target() != p2.target() || p1.value() != p2.value() {
172+
bail!("child #{} -- processing instruction differs", i);
173+
}
174+
} else {
175+
bail!(
176+
"child #{}, first is processing instruction, second is something else",
177+
i
178+
);
179+
}
180+
}
181+
}
182+
}
183+
return Ok(());
184+
}
185+
186+
/// Not really meant to be public -- used by tests in some packages
187+
pub fn trim_element(e: Element, allow_structure_in_leaves: bool) {
188+
// "<mtext>this is text</mtext" results in 3 text children
189+
// these are combined into one child as it makes code downstream simpler
190+
191+
// space, tab, newline, carriage return all get collapsed to a single space
192+
const WHITESPACE: &[char] = &[' ', '\u{0009}', '\u{000A}', '\u{000D}'];
193+
lazy_static! {
194+
static ref WHITESPACE_MATCH: Regex = Regex::new(r#"[ \u{0009}\u{000A}\u{000D}]+"#).unwrap();
195+
}
196+
197+
if is_leaf(e) && (!allow_structure_in_leaves || IsNode::is_mathml(e)) {
198+
// Assume it is HTML inside of the leaf -- turn the HTML into a string
199+
make_leaf_element(e);
200+
return;
201+
}
202+
203+
let mut single_text = "".to_string();
204+
for child in e.children() {
205+
match child {
206+
ChildOfElement::Element(c) => {
207+
trim_element(c, allow_structure_in_leaves);
208+
}
209+
ChildOfElement::Text(t) => {
210+
single_text += t.text();
211+
e.remove_child(child);
212+
}
213+
_ => {
214+
e.remove_child(child);
215+
}
216+
}
217+
}
218+
219+
// CSS considers only space, tab, linefeed, and carriage return as collapsable whitespace
220+
if !(is_leaf(e) || name(e) == "intent-literal" || single_text.is_empty()) {
221+
// intent-literal comes from testing intent
222+
// FIX: we have a problem -- what should happen???
223+
// FIX: For now, just keep the children and ignore the text and log an error -- shouldn't panic/crash
224+
if !single_text.trim_matches(WHITESPACE).is_empty() {
225+
error!(
226+
"trim_element: both element and textual children which shouldn't happen -- ignoring text '{}'",
227+
single_text
228+
);
229+
}
230+
return;
231+
}
232+
if e.children().is_empty() && !single_text.is_empty() {
233+
// debug!("Combining text in {}: '{}' -> '{}'", e.name().local_part(), single_text, trimmed_text);
234+
e.set_text(&WHITESPACE_MATCH.replace_all(&single_text, " "));
235+
}
236+
237+
fn make_leaf_element(mathml_leaf: Element) {
238+
// MathML leaves like <mn> really shouldn't have non-textual content, but you could have embedded HTML
239+
// Here, we take convert them to leaves by grabbing up all the text and making that the content
240+
// Potentially, we leave them and let (default) rules do something, but it makes other parts of the code
241+
// messier because checking the text of a leaf becomes Option<&str> rather than just &str
242+
let children = mathml_leaf.children();
243+
if children.is_empty() {
244+
return;
245+
}
246+
247+
// gather up the text
248+
let mut text = "".to_string();
249+
for child in children {
250+
let child_text = match child {
251+
ChildOfElement::Element(child) => {
252+
if name(child) == "mglyph" {
253+
child.attribute_value("alt").unwrap_or("").to_string()
254+
} else {
255+
gather_text(child)
256+
}
257+
}
258+
ChildOfElement::Text(t) => {
259+
// debug!("ChildOfElement::Text: '{}'", t.text());
260+
t.text().to_string()
261+
}
262+
_ => "".to_string(),
263+
};
264+
if !child_text.is_empty() {
265+
text += &child_text;
266+
}
267+
}
268+
269+
// get rid of the old children and replace with the text we just built
270+
mathml_leaf.clear_children();
271+
mathml_leaf.set_text(WHITESPACE_MATCH.replace_all(&text, " ").trim_matches(WHITESPACE));
272+
// debug!("make_leaf_element: text is '{}'", crate::canonicalize::as_text(mathml_leaf));
273+
274+
/// gather up all the contents of the element and return them with a leading space
275+
fn gather_text(html: Element) -> String {
276+
let mut text = "".to_string(); // since we are throwing out the element tag, add a space between the contents
277+
for child in html.children() {
278+
match child {
279+
ChildOfElement::Element(child) => {
280+
text += &gather_text(child);
281+
}
282+
ChildOfElement::Text(t) => text += t.text(),
283+
_ => (),
284+
}
285+
}
286+
// debug!("gather_text: '{}'", text);
287+
return text;
288+
}
289+
}
290+
}
291+
292+
pub(crate) fn add_ids(mathml: Element) -> Element {
293+
use std::time::SystemTime;
294+
let time = if cfg!(target_family = "wasm") {
295+
fastrand::usize(..)
296+
} else {
297+
SystemTime::now()
298+
.duration_since(SystemTime::UNIX_EPOCH)
299+
.unwrap()
300+
.as_millis() as usize
301+
};
302+
let time_part = radix_fmt::radix(time, 36).to_string();
303+
let random_part = radix_fmt::radix(fastrand::u32(..), 36).to_string();
304+
let prefix = "M".to_string() + &time_part[time_part.len() - 3..] + &random_part[random_part.len() - 4..] + "-"; // begin with letter
305+
add_ids_to_all(mathml, &prefix, 0);
306+
return mathml;
307+
308+
fn add_ids_to_all(mathml: Element, id_prefix: &str, count: usize) -> usize {
309+
let mut count = count;
310+
if mathml.attribute("id").is_none() {
311+
mathml.set_attribute_value("id", (id_prefix.to_string() + &count.to_string()).as_str());
312+
mathml.set_attribute_value("data-id-added", "true");
313+
count += 1;
314+
};
315+
316+
if crate::xpath_functions::is_leaf(mathml) {
317+
return count;
318+
}
319+
320+
for child in mathml.children() {
321+
let child = as_element(child);
322+
count = add_ids_to_all(child, id_prefix, count);
323+
}
324+
return count;
325+
}
326+
}

src/infer_intent.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,7 @@ mod tests {
611611

612612

613613
fn test_intent(mathml: &str, target: &str, intent_error_recovery: &str) -> bool {
614-
use crate::interface::*;
614+
use crate::{interface::*, element_util::*};
615615
// this forces initialization
616616
crate::interface::set_rules_dir(super::super::abs_rules_dir_path()).unwrap();
617617
// crate::speech::SpeechRules::initialize_all_rules().unwrap();

0 commit comments

Comments
 (0)