|
1 | 1 | from unittest import TestCase |
2 | 2 |
|
3 | 3 | from lxml import etree as ET |
| 4 | +from lxml import html as lxml_html |
4 | 5 |
|
5 | | -from scielo_classic_website.htmlbody.html_fixer import remove_invalid_xml_comments |
| 6 | +from scielo_classic_website.htmlbody.html_fixer import ( |
| 7 | + get_fixed_html, |
| 8 | + load_html, |
| 9 | + remove_invalid_namespace_attributes, |
| 10 | + remove_invalid_xml_comments, |
| 11 | +) |
6 | 12 |
|
7 | 13 |
|
8 | 14 | class TestRemoveInvalidXmlComments(TestCase): |
@@ -71,3 +77,65 @@ def test_multiline_invalid_comment(self): |
71 | 77 | html = "<p>text</p><!--EndF>>\n<!--EndFragment--><p>more</p>" |
72 | 78 | result = remove_invalid_xml_comments(html) |
73 | 79 | self.assertEqual(result, "<p>text</p><p>more</p>") |
| 80 | + |
| 81 | + |
class TestRemoveInvalidNamespaceAttributes(TestCase):
    """Exercise ``remove_invalid_namespace_attributes`` and its callers.

    The cases cover three entry points: the function applied directly to
    a tree parsed with ``lxml.html``, ``load_html``, and
    ``get_fixed_html``.  Attributes such as ``mailto:dade`` use a prefix
    that looks like a namespace but is not one; the tests expect them to
    be dropped so the tree can be serialized and re-parsed as XML.
    """

    # Anchor carrying one bogus prefixed attribute next to a regular one.
    _BAD_ANCHOR_HTML = (
        '<html><body><a mailto:dade="x" href="y">link</a></body></html>'
    )

    @staticmethod
    def _cleaned(markup):
        """Parse *markup* with ``lxml.html`` and run the cleanup on it.

        The same tree object is returned and inspected afterwards, so
        the tests rely on the cleanup modifying the tree it is given.
        """
        root = lxml_html.fromstring(markup)
        remove_invalid_namespace_attributes(root)
        return root

    @staticmethod
    def _reparse_as_xml(node):
        """Serialize *node* as XML and parse the text again.

        There is no explicit assertion: the check is that the XML
        parser does not raise ``XMLSyntaxError`` on the output.
        """
        xml_text = ET.tostring(node, method="xml").decode("utf-8")
        return ET.fromstring(xml_text)

    def test_removes_undefined_namespace_attribute(self):
        """The bogus attribute goes away; the plain ``href`` stays."""
        root = self._cleaned(self._BAD_ANCHOR_HTML)
        anchor = root.find(".//a")
        self.assertNotIn("mailto:dade", anchor.attrib)
        self.assertEqual(anchor.get("href"), "y")

    def test_serialized_tree_is_valid_xml(self):
        """A cleaned tree survives an XML serialize/parse round trip."""
        root = self._cleaned(self._BAD_ANCHOR_HTML)
        self._reparse_as_xml(root)

    def test_preserves_xml_and_xlink_prefixes(self):
        """``xml:`` and ``xlink:`` attributes are kept, ``mailto:`` is not."""
        root = self._cleaned(
            '<html><body><a xml:lang="pt" xlink:href="x" '
            'mailto:foo="y">link</a></body></html>'
        )
        anchor = root.find(".//a")
        self.assertEqual(anchor.get("xml:lang"), "pt")
        self.assertEqual(anchor.get("xlink:href"), "x")
        self.assertNotIn("mailto:foo", anchor.attrib)

    def test_preserves_attributes_without_colon(self):
        """Attributes with no prefix at all are left untouched."""
        root = self._cleaned(
            '<html><body><p id="x" class="y">text</p></body></html>'
        )
        paragraph = root.find(".//p")
        self.assertEqual(paragraph.get("id"), "x")
        self.assertEqual(paragraph.get("class"), "y")

    def test_handles_none_tree(self):
        """Passing ``None`` instead of a tree yields ``None``."""
        outcome = remove_invalid_namespace_attributes(None)
        self.assertIsNone(outcome)

    def test_load_html_strips_invalid_namespace_attributes(self):
        """``load_html`` returns a tree already free of the bogus attribute."""
        root = load_html('<p>foo <a mailto:dade="z" href="y">link</a> bar</p>')
        anchor = root.find(".//a")
        self.assertNotIn("mailto:dade", anchor.attrib)
        # The loaded tree must also round-trip through the XML parser.
        self._reparse_as_xml(root)

    def test_get_fixed_html_output_is_valid_xml(self):
        """``get_fixed_html`` output parses as XML once given a root element.

        The attribute value contains ``>``.  Per the original author's
        note, the regex-based ``remove_namespaces_from_content`` step
        (used inside ``fix()``) cannot reliably strip the bad attribute
        in that situation, so this case depends on the tree-level
        cleanup to still produce XML that re-parses without errors.
        """
        content = '<p>Hello <a mailto:dade="a>b" href="x">world</a></p>'
        fixed = get_fixed_html(content)
        # Wrap the result in a single root element before parsing it.
        ET.fromstring(f"<root>{fixed}</root>")
0 commit comments