|
| 1 | +use crate::canonicalize::{as_element, name}; |
| 2 | +use crate::errors::*; |
| 3 | +use crate::logs::enable_logs; |
| 4 | +use crate::xpath_functions::{is_leaf, IsNode}; |
| 5 | +use lazy_static::lazy_static; |
| 6 | +use regex::Regex; |
| 7 | +use sxd_document::{Package}; |
| 8 | +use sxd_document::dom::*; |
| 9 | + |
| 10 | +pub fn get_element(package: &Package) -> Element { |
| 11 | + enable_logs(); |
| 12 | + let doc = package.as_document(); |
| 13 | + let mut result = None; |
| 14 | + for root_child in doc.root().children() { |
| 15 | + if let ChildOfRoot::Element(e) = root_child { |
| 16 | + assert!(result.is_none()); |
| 17 | + result = Some(e); |
| 18 | + } |
| 19 | + } |
| 20 | + return result.unwrap(); |
| 21 | +} |
| 22 | + |
| 23 | +/// returns Ok() if two Documents are equal or some info where they differ in the Err |
| 24 | +// Not really meant to be public -- used by tests in some packages |
| 25 | +#[allow(dead_code)] |
| 26 | +pub fn is_same_element(e1: Element, e2: Element) -> Result<()> { |
| 27 | + enable_logs(); |
| 28 | + if name(e1) != name(e2) { |
| 29 | + bail!("Names not the same: {}, {}", name(e1), name(e2)); |
| 30 | + } |
| 31 | + |
| 32 | + // assume 'e' doesn't have element children until proven otherwise |
| 33 | + // this means we keep Text children until we are proven they aren't needed |
| 34 | + if e1.children().len() != e2.children().len() { |
| 35 | + bail!( |
| 36 | + "Children of {} have {} != {} children", |
| 37 | + name(e1), |
| 38 | + e1.children().len(), |
| 39 | + e2.children().len() |
| 40 | + ); |
| 41 | + } |
| 42 | + |
| 43 | + if let Err(e) = attrs_are_same(e1.attributes(), e2.attributes()) { |
| 44 | + bail!("In element {}, {}", name(e1), e); |
| 45 | + } |
| 46 | + |
| 47 | + for (i, (c1, c2)) in e1.children().iter().zip(e2.children().iter()).enumerate() { |
| 48 | + match c1 { |
| 49 | + ChildOfElement::Element(child1) => { |
| 50 | + if let ChildOfElement::Element(child2) = c2 { |
| 51 | + is_same_element(*child1, *child2)?; |
| 52 | + } else { |
| 53 | + bail!("{} child #{}, first is element, second is something else", name(e1), i); |
| 54 | + } |
| 55 | + } |
| 56 | + ChildOfElement::Comment(com1) => { |
| 57 | + if let ChildOfElement::Comment(com2) = c2 { |
| 58 | + if com1.text() != com2.text() { |
| 59 | + bail!("{} child #{} -- comment text differs", name(e1), i); |
| 60 | + } |
| 61 | + } else { |
| 62 | + bail!("{} child #{}, first is comment, second is something else", name(e1), i); |
| 63 | + } |
| 64 | + } |
| 65 | + ChildOfElement::ProcessingInstruction(p1) => { |
| 66 | + if let ChildOfElement::ProcessingInstruction(p2) = c2 { |
| 67 | + if p1.target() != p2.target() || p1.value() != p2.value() { |
| 68 | + bail!("{} child #{} -- processing instruction differs", name(e1), i); |
| 69 | + } |
| 70 | + } else { |
| 71 | + bail!( |
| 72 | + "{} child #{}, first is processing instruction, second is something else", |
| 73 | + name(e1), |
| 74 | + i |
| 75 | + ); |
| 76 | + } |
| 77 | + } |
| 78 | + ChildOfElement::Text(t1) => { |
| 79 | + if let ChildOfElement::Text(t2) = c2 { |
| 80 | + if t1.text() != t2.text() { |
| 81 | + bail!("{} child #{} -- text differs", name(e1), i); |
| 82 | + } |
| 83 | + } else { |
| 84 | + bail!("{} child #{}, first is text, second is something else", name(e1), i); |
| 85 | + } |
| 86 | + } |
| 87 | + } |
| 88 | + } |
| 89 | + return Ok(()); |
| 90 | + |
| 91 | + /// compares attributes -- '==' didn't seems to work |
| 92 | + fn attrs_are_same(attrs1: Vec<Attribute>, attrs2: Vec<Attribute>) -> Result<()> { |
| 93 | + if attrs1.len() != attrs2.len() { |
| 94 | + bail!("Attributes have different length: {:?} != {:?}", attrs1, attrs2); |
| 95 | + } |
| 96 | + // can't guarantee attrs are in the same order |
| 97 | + for attr1 in attrs1 { |
| 98 | + if let Some(found_attr2) = attrs2 |
| 99 | + .iter() |
| 100 | + .find(|&attr2| attr1.name().local_part() == attr2.name().local_part()) |
| 101 | + { |
| 102 | + if attr1.value() == found_attr2.value() { |
| 103 | + continue; |
| 104 | + } else { |
| 105 | + bail!( |
| 106 | + "Attribute named {} has differing values:\n '{}'\n '{}'", |
| 107 | + attr1.name().local_part(), |
| 108 | + attr1.value(), |
| 109 | + found_attr2.value() |
| 110 | + ); |
| 111 | + } |
| 112 | + } else { |
| 113 | + bail!( |
| 114 | + "Attribute name {} not in [{}]", |
| 115 | + print_attr(&attr1), |
| 116 | + print_attrs(&attrs2) |
| 117 | + ); |
| 118 | + } |
| 119 | + } |
| 120 | + return Ok(()); |
| 121 | + |
| 122 | + fn print_attr(attr: &Attribute) -> String { |
| 123 | + return format!("@{}='{}'", attr.name().local_part(), attr.value()); |
| 124 | + } |
| 125 | + fn print_attrs(attrs: &[Attribute]) -> String { |
| 126 | + return attrs.iter().map(print_attr).collect::<Vec<String>>().join(", "); |
| 127 | + } |
| 128 | + } |
| 129 | +} |
| 130 | + |
| 131 | +// used for testing trim |
| 132 | +/// returns Ok() if two Documents are equal or some info where they differ in the Err |
| 133 | +#[allow(dead_code)] |
| 134 | +pub(crate) fn is_same_doc(doc1: &Document, doc2: &Document) -> Result<()> { |
| 135 | + // assume 'e' doesn't have element children until proven otherwise |
| 136 | + // this means we keep Text children until we are proven they aren't needed |
| 137 | + if doc1.root().children().len() != doc2.root().children().len() { |
| 138 | + bail!( |
| 139 | + "Children of docs have {} != {} children", |
| 140 | + doc1.root().children().len(), |
| 141 | + doc2.root().children().len() |
| 142 | + ); |
| 143 | + } |
| 144 | + |
| 145 | + for (i, (c1, c2)) in doc1 |
| 146 | + .root() |
| 147 | + .children() |
| 148 | + .iter() |
| 149 | + .zip(doc2.root().children().iter()) |
| 150 | + .enumerate() |
| 151 | + { |
| 152 | + match c1 { |
| 153 | + ChildOfRoot::Element(e1) => { |
| 154 | + if let ChildOfRoot::Element(e2) = c2 { |
| 155 | + is_same_element(*e1, *e2)?; |
| 156 | + } else { |
| 157 | + bail!("child #{}, first is element, second is something else", i); |
| 158 | + } |
| 159 | + } |
| 160 | + ChildOfRoot::Comment(com1) => { |
| 161 | + if let ChildOfRoot::Comment(com2) = c2 { |
| 162 | + if com1.text() != com2.text() { |
| 163 | + bail!("child #{} -- comment text differs", i); |
| 164 | + } |
| 165 | + } else { |
| 166 | + bail!("child #{}, first is comment, second is something else", i); |
| 167 | + } |
| 168 | + } |
| 169 | + ChildOfRoot::ProcessingInstruction(p1) => { |
| 170 | + if let ChildOfRoot::ProcessingInstruction(p2) = c2 { |
| 171 | + if p1.target() != p2.target() || p1.value() != p2.value() { |
| 172 | + bail!("child #{} -- processing instruction differs", i); |
| 173 | + } |
| 174 | + } else { |
| 175 | + bail!( |
| 176 | + "child #{}, first is processing instruction, second is something else", |
| 177 | + i |
| 178 | + ); |
| 179 | + } |
| 180 | + } |
| 181 | + } |
| 182 | + } |
| 183 | + return Ok(()); |
| 184 | +} |
| 185 | + |
| 186 | +/// Not really meant to be public -- used by tests in some packages |
| 187 | +pub fn trim_element(e: Element, allow_structure_in_leaves: bool) { |
| 188 | + // "<mtext>this is text</mtext" results in 3 text children |
| 189 | + // these are combined into one child as it makes code downstream simpler |
| 190 | + |
| 191 | + // space, tab, newline, carriage return all get collapsed to a single space |
| 192 | + const WHITESPACE: &[char] = &[' ', '\u{0009}', '\u{000A}', '\u{000D}']; |
| 193 | + lazy_static! { |
| 194 | + static ref WHITESPACE_MATCH: Regex = Regex::new(r#"[ \u{0009}\u{000A}\u{000D}]+"#).unwrap(); |
| 195 | + } |
| 196 | + |
| 197 | + if is_leaf(e) && (!allow_structure_in_leaves || IsNode::is_mathml(e)) { |
| 198 | + // Assume it is HTML inside of the leaf -- turn the HTML into a string |
| 199 | + make_leaf_element(e); |
| 200 | + return; |
| 201 | + } |
| 202 | + |
| 203 | + let mut single_text = "".to_string(); |
| 204 | + for child in e.children() { |
| 205 | + match child { |
| 206 | + ChildOfElement::Element(c) => { |
| 207 | + trim_element(c, allow_structure_in_leaves); |
| 208 | + } |
| 209 | + ChildOfElement::Text(t) => { |
| 210 | + single_text += t.text(); |
| 211 | + e.remove_child(child); |
| 212 | + } |
| 213 | + _ => { |
| 214 | + e.remove_child(child); |
| 215 | + } |
| 216 | + } |
| 217 | + } |
| 218 | + |
| 219 | + // CSS considers only space, tab, linefeed, and carriage return as collapsable whitespace |
| 220 | + if !(is_leaf(e) || name(e) == "intent-literal" || single_text.is_empty()) { |
| 221 | + // intent-literal comes from testing intent |
| 222 | + // FIX: we have a problem -- what should happen??? |
| 223 | + // FIX: For now, just keep the children and ignore the text and log an error -- shouldn't panic/crash |
| 224 | + if !single_text.trim_matches(WHITESPACE).is_empty() { |
| 225 | + error!( |
| 226 | + "trim_element: both element and textual children which shouldn't happen -- ignoring text '{}'", |
| 227 | + single_text |
| 228 | + ); |
| 229 | + } |
| 230 | + return; |
| 231 | + } |
| 232 | + if e.children().is_empty() && !single_text.is_empty() { |
| 233 | + // debug!("Combining text in {}: '{}' -> '{}'", e.name().local_part(), single_text, trimmed_text); |
| 234 | + e.set_text(&WHITESPACE_MATCH.replace_all(&single_text, " ")); |
| 235 | + } |
| 236 | + |
| 237 | + fn make_leaf_element(mathml_leaf: Element) { |
| 238 | + // MathML leaves like <mn> really shouldn't have non-textual content, but you could have embedded HTML |
| 239 | + // Here, we take convert them to leaves by grabbing up all the text and making that the content |
| 240 | + // Potentially, we leave them and let (default) rules do something, but it makes other parts of the code |
| 241 | + // messier because checking the text of a leaf becomes Option<&str> rather than just &str |
| 242 | + let children = mathml_leaf.children(); |
| 243 | + if children.is_empty() { |
| 244 | + return; |
| 245 | + } |
| 246 | + |
| 247 | + // gather up the text |
| 248 | + let mut text = "".to_string(); |
| 249 | + for child in children { |
| 250 | + let child_text = match child { |
| 251 | + ChildOfElement::Element(child) => { |
| 252 | + if name(child) == "mglyph" { |
| 253 | + child.attribute_value("alt").unwrap_or("").to_string() |
| 254 | + } else { |
| 255 | + gather_text(child) |
| 256 | + } |
| 257 | + } |
| 258 | + ChildOfElement::Text(t) => { |
| 259 | + // debug!("ChildOfElement::Text: '{}'", t.text()); |
| 260 | + t.text().to_string() |
| 261 | + } |
| 262 | + _ => "".to_string(), |
| 263 | + }; |
| 264 | + if !child_text.is_empty() { |
| 265 | + text += &child_text; |
| 266 | + } |
| 267 | + } |
| 268 | + |
| 269 | + // get rid of the old children and replace with the text we just built |
| 270 | + mathml_leaf.clear_children(); |
| 271 | + mathml_leaf.set_text(WHITESPACE_MATCH.replace_all(&text, " ").trim_matches(WHITESPACE)); |
| 272 | + // debug!("make_leaf_element: text is '{}'", crate::canonicalize::as_text(mathml_leaf)); |
| 273 | + |
| 274 | + /// gather up all the contents of the element and return them with a leading space |
| 275 | + fn gather_text(html: Element) -> String { |
| 276 | + let mut text = "".to_string(); // since we are throwing out the element tag, add a space between the contents |
| 277 | + for child in html.children() { |
| 278 | + match child { |
| 279 | + ChildOfElement::Element(child) => { |
| 280 | + text += &gather_text(child); |
| 281 | + } |
| 282 | + ChildOfElement::Text(t) => text += t.text(), |
| 283 | + _ => (), |
| 284 | + } |
| 285 | + } |
| 286 | + // debug!("gather_text: '{}'", text); |
| 287 | + return text; |
| 288 | + } |
| 289 | + } |
| 290 | +} |
| 291 | + |
| 292 | +pub(crate) fn add_ids(mathml: Element) -> Element { |
| 293 | + use std::time::SystemTime; |
| 294 | + let time = if cfg!(target_family = "wasm") { |
| 295 | + fastrand::usize(..) |
| 296 | + } else { |
| 297 | + SystemTime::now() |
| 298 | + .duration_since(SystemTime::UNIX_EPOCH) |
| 299 | + .unwrap() |
| 300 | + .as_millis() as usize |
| 301 | + }; |
| 302 | + let time_part = radix_fmt::radix(time, 36).to_string(); |
| 303 | + let random_part = radix_fmt::radix(fastrand::u32(..), 36).to_string(); |
| 304 | + let prefix = "M".to_string() + &time_part[time_part.len() - 3..] + &random_part[random_part.len() - 4..] + "-"; // begin with letter |
| 305 | + add_ids_to_all(mathml, &prefix, 0); |
| 306 | + return mathml; |
| 307 | + |
| 308 | + fn add_ids_to_all(mathml: Element, id_prefix: &str, count: usize) -> usize { |
| 309 | + let mut count = count; |
| 310 | + if mathml.attribute("id").is_none() { |
| 311 | + mathml.set_attribute_value("id", (id_prefix.to_string() + &count.to_string()).as_str()); |
| 312 | + mathml.set_attribute_value("data-id-added", "true"); |
| 313 | + count += 1; |
| 314 | + }; |
| 315 | + |
| 316 | + if crate::xpath_functions::is_leaf(mathml) { |
| 317 | + return count; |
| 318 | + } |
| 319 | + |
| 320 | + for child in mathml.children() { |
| 321 | + let child = as_element(child); |
| 322 | + count = add_ids_to_all(child, id_prefix, count); |
| 323 | + } |
| 324 | + return count; |
| 325 | + } |
| 326 | +} |
0 commit comments