Skip to content

Commit 80d9c33

Browse files
Strip attributes with undefined namespace prefixes from parsed HTML trees
Agent-Logs-Url: https://github.com/scieloorg/scielo_migration/sessions/807d21a0-f07d-4b7f-9102-fcce21174c55 Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
1 parent 8bb3234 commit 80d9c33

2 files changed

Lines changed: 115 additions & 2 deletions

File tree

scielo_classic_website/htmlbody/html_fixer.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_
3939

4040

4141
def load_html(content):
    """Parse ``content`` into a tree without undeclared-prefix attributes.

    The content is passed through ``wrap_html`` and parsed with
    ``fromstring``; the resulting tree is then cleaned by
    ``remove_invalid_namespace_attributes`` so that a later XML
    serialization and re-parse is not broken by attributes such as
    ``mailto:dade``.

    Returns the parsed tree (the same object the cleanup step hands back).
    """
    wrapped = wrap_html(content)
    root = fromstring(wrapped)
    # The cleanup mutates ``root`` in place and returns that same object.
    return remove_invalid_namespace_attributes(root)
4345

4446

4547
def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namespaces=True):
@@ -60,6 +62,7 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp
6062
fixed_content = fix(content, style_mappings, tags_to_fix)
6163
wrapped = wrap_html(fixed_content)
6264
tree = fromstring(wrapped)
65+
remove_invalid_namespace_attributes(tree)
6366
return html2xml(tree)
6467

6568

@@ -366,6 +369,48 @@ def remove_invalid_xml_comments(html):
366369
return re.sub(r'<!--.*?-->', _filter_invalid_xml_comment, html, flags=re.DOTALL)
367370

368371

372+
# Attribute-name prefixes that are never stripped by
# ``remove_invalid_namespace_attributes``.
_VALID_NAMESPACE_PREFIXES = frozenset({"xml", "xlink"})


def remove_invalid_namespace_attributes(tree):
    """
    Remove attributes whose name carries an undeclared namespace prefix.

    Source HTML occasionally contains malformed attributes such as
    ``<a mailto:dade="...">``. The lxml HTML parser keeps the literal name,
    colon included. When the tree is serialized as XML and parsed again,
    lxml reads the colon as a namespace separator and raises
    ``XMLSyntaxError`` ("Namespace prefix X for Y on Z is not defined").

    This function walks the tree and deletes such attributes in place.
    The standard prefixes (``xml``, ``xlink``) are preserved; attributes
    already mapped to a namespace by lxml (stored in Clark notation,
    ``{uri}localname``) are preserved as well.

    NOTE(review): ``xlink`` is kept on the assumption that its namespace is
    declared somewhere before the XML is re-parsed -- confirm with callers.

    Returns the same ``tree`` object that was passed in (``None`` stays
    ``None``).
    """
    if tree is None:
        return tree

    # Objects without ``iter()`` are treated as a single stand-alone node.
    nodes = tree.iter() if hasattr(tree, "iter") else [tree]
    for node in nodes:
        attributes = getattr(node, "attrib", None)
        if not attributes:
            continue
        # Collect first, delete afterwards: the mapping must not change
        # while its keys are being examined.
        doomed = [
            name
            for name in attributes.keys()
            if isinstance(name, str)
            # Clark notation means the attribute is already namespaced.
            and not name.startswith("{")
            and ":" in name
            and name.partition(":")[0] not in _VALID_NAMESPACE_PREFIXES
        ]
        for name in doomed:
            del attributes[name]
    return tree
412+
413+
369414
def remove_ms_office_conditionals(xml_str):
370415
"""
371416
Remove blocos condicionais do MS Office que causam erros de parsing XML.

tests/test_html_fixer.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
from unittest import TestCase
22

33
from lxml import etree as ET
4+
from lxml import html as lxml_html
45

5-
from scielo_classic_website.htmlbody.html_fixer import remove_invalid_xml_comments
6+
from scielo_classic_website.htmlbody.html_fixer import (
7+
get_fixed_html,
8+
load_html,
9+
remove_invalid_namespace_attributes,
10+
remove_invalid_xml_comments,
11+
)
612

713

814
class TestRemoveInvalidXmlComments(TestCase):
@@ -71,3 +77,65 @@ def test_multiline_invalid_comment(self):
7177
html = "<p>text</p><!--EndF>>\n<!--EndFragment--><p>more</p>"
7278
result = remove_invalid_xml_comments(html)
7379
self.assertEqual(result, "<p>text</p><p>more</p>")
80+
81+
82+
class TestRemoveInvalidNamespaceAttributes(TestCase):
    """Checks that attributes with undeclared namespace prefixes are dropped."""

    # Anchor carrying one malformed attribute next to a regular ``href``.
    _BAD_ANCHOR_HTML = (
        '<html><body><a mailto:dade="x" href="y">link</a></body></html>'
    )

    def test_removes_undefined_namespace_attribute(self):
        """The malformed attribute goes away; ``href`` is untouched."""
        root = lxml_html.fromstring(self._BAD_ANCHOR_HTML)
        remove_invalid_namespace_attributes(root)
        anchor = root.find(".//a")
        self.assertNotIn("mailto:dade", anchor.attrib)
        self.assertEqual(anchor.get("href"), "y")

    def test_serialized_tree_is_valid_xml(self):
        """A cleaned tree survives an XML serialize/parse round trip."""
        root = lxml_html.fromstring(self._BAD_ANCHOR_HTML)
        remove_invalid_namespace_attributes(root)
        xml_text = ET.tostring(root, method="xml").decode("utf-8")
        # Parsing the output as XML must not raise XMLSyntaxError.
        ET.fromstring(xml_text)

    def test_preserves_xml_and_xlink_prefixes(self):
        """``xml:`` and ``xlink:`` attributes stay; other prefixes do not."""
        markup = (
            '<html><body>'
            '<a xml:lang="pt" xlink:href="x" mailto:foo="y">link</a>'
            '</body></html>'
        )
        root = lxml_html.fromstring(markup)
        remove_invalid_namespace_attributes(root)
        anchor = root.find(".//a")
        self.assertEqual(anchor.get("xml:lang"), "pt")
        self.assertEqual(anchor.get("xlink:href"), "x")
        self.assertNotIn("mailto:foo", anchor.attrib)

    def test_preserves_attributes_without_colon(self):
        """Plain attribute names are never touched."""
        root = lxml_html.fromstring(
            '<html><body><p id="x" class="y">text</p></body></html>'
        )
        remove_invalid_namespace_attributes(root)
        paragraph = root.find(".//p")
        self.assertEqual(paragraph.get("id"), "x")
        self.assertEqual(paragraph.get("class"), "y")

    def test_handles_none_tree(self):
        """``None`` is accepted and handed back unchanged."""
        outcome = remove_invalid_namespace_attributes(None)
        self.assertIsNone(outcome)

    def test_load_html_strips_invalid_namespace_attributes(self):
        """``load_html`` applies the cleanup to the tree it returns."""
        root = load_html('<p>foo <a mailto:dade="z" href="y">link</a> bar</p>')
        anchor = root.find(".//a")
        self.assertNotIn("mailto:dade", anchor.attrib)
        # The loaded tree must serialize to XML that parses cleanly.
        xml_text = ET.tostring(root, method="xml").decode("utf-8")
        ET.fromstring(xml_text)

    def test_get_fixed_html_output_is_valid_xml(self):
        """``get_fixed_html`` output re-parses as XML despite a bad attribute."""
        # The attribute value contains '>' so the regex-based
        # ``remove_namespaces_from_content`` step (used inside ``fix()``)
        # cannot reliably strip the bad attribute. The tree-level cleanup
        # must still yield XML that parses without errors.
        source = '<p>Hello <a mailto:dade="a>b" href="x">world</a></p>'
        fixed = get_fixed_html(source)
        ET.fromstring(f"<root>{fixed}</root>")

0 commit comments

Comments
 (0)