Strip attributes with undefined namespace prefixes from parsed HTML trees

Copilot · robertatakenaka · web-flow · commit 80d9c3364bf5 · 2026-04-28T19:59:21.000Z
Agent-Logs-Url: https://github.com/scieloorg/scielo_migration/sessions/807d21a0-f07d-4b7f-9102-fcce21174c55 Co-authored-by: robertatakenaka <505143+robertatakenaka@users.noreply.github.com>
diff --git a/scielo_classic_website/htmlbody/html_fixer.py b/scielo_classic_website/htmlbody/html_fixer.py
@@ -39,7 +39,9 @@ def get_best_choice_between_original_and_fixed(score, original, fixed_html, min_
 
 
 def load_html(content):
-    return fromstring(wrap_html(content))
+    tree = fromstring(wrap_html(content))
+    remove_invalid_namespace_attributes(tree)  # mutates tree in place; return value unused
+    return tree
 
 
 def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namespaces=True):
@@ -60,6 +62,7 @@ def get_fixed_html(content, style_mappings=None, tags_to_fix=None, remove_namesp
     fixed_content = fix(content, style_mappings, tags_to_fix)
     wrapped = wrap_html(fixed_content)
     tree = fromstring(wrapped)
+    remove_invalid_namespace_attributes(tree)
     return html2xml(tree)
 
 
@@ -366,6 +369,48 @@ def remove_invalid_xml_comments(html):
     return re.sub(r'<!--.*?-->', _filter_invalid_xml_comment, html, flags=re.DOTALL)
 
 
+_VALID_NAMESPACE_PREFIXES = frozenset({"xml", "xlink"})  # prefixed attributes that are kept
+
+
+def remove_invalid_namespace_attributes(tree):
+    """
+    Remove attributes whose name carries an undeclared namespace prefix.
+
+    Source HTML occasionally contains malformed attributes such as
+    ``<a mailto:dade="...">``. lxml's HTML parser keeps the literal name,
+    colon included. When the tree is serialized as XML and parsed again,
+    lxml treats the colon as a namespace separator and raises
+    ``XMLSyntaxError`` ("Namespace prefix X for Y on Z is not
+    defined").
+
+    This function walks the tree and removes such attributes in place.
+    The standard prefixes (``xml``, ``xlink``) are preserved; attributes
+    that lxml has already mapped to a namespace (stored in Clark notation,
+    ``{uri}localname``) are preserved as well. Returns the same ``tree``.
+    """
+    if tree is None:
+        return tree
+
+    elements = tree.iter() if hasattr(tree, "iter") else [tree]
+    for elem in elements:
+        attrib = getattr(elem, "attrib", None)
+        if not attrib:
+            continue
+        for attr_name in list(attrib.keys()):  # snapshot: entries are deleted below
+            if not isinstance(attr_name, str):
+                continue
+            # Attributes already mapped to a namespace use Clark notation
+            if attr_name.startswith("{"):
+                continue
+            if ":" not in attr_name:
+                continue
+            prefix = attr_name.split(":", 1)[0]
+            if prefix in _VALID_NAMESPACE_PREFIXES:
+                continue
+            del attrib[attr_name]
+    return tree
+
+
 def remove_ms_office_conditionals(xml_str):
     """
     Remove blocos condicionais do MS Office que causam erros de parsing XML.
diff --git a/tests/test_html_fixer.py b/tests/test_html_fixer.py
@@ -1,8 +1,14 @@
 from unittest import TestCase
 
 from lxml import etree as ET
+from lxml import html as lxml_html
 
-from scielo_classic_website.htmlbody.html_fixer import remove_invalid_xml_comments
+from scielo_classic_website.htmlbody.html_fixer import (
+    get_fixed_html,
+    load_html,
+    remove_invalid_namespace_attributes,
+    remove_invalid_xml_comments,
+)
 
 
 class TestRemoveInvalidXmlComments(TestCase):
@@ -71,3 +77,65 @@ def test_multiline_invalid_comment(self):
         html = "<p>text</p><!--EndF>>\n<!--EndFragment--><p>more</p>"
         result = remove_invalid_xml_comments(html)
         self.assertEqual(result, "<p>text</p><p>more</p>")
+
+
+class TestRemoveInvalidNamespaceAttributes(TestCase):
+    def test_removes_undefined_namespace_attribute(self):
+        tree = lxml_html.fromstring(
+            '<html><body><a mailto:dade="x" href="y">link</a></body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        a = tree.find(".//a")
+        self.assertNotIn("mailto:dade", a.attrib)
+        self.assertEqual(a.get("href"), "y")
+
+    def test_serialized_tree_is_valid_xml(self):
+        tree = lxml_html.fromstring(
+            '<html><body><a mailto:dade="x" href="y">link</a></body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        serialized = ET.tostring(tree, method="xml").decode("utf-8")
+        # Re-parsing as XML must not raise XMLSyntaxError
+        ET.fromstring(serialized)
+
+    def test_preserves_xml_and_xlink_prefixes(self):
+        tree = lxml_html.fromstring(
+            '<html><body>'
+            '<a xml:lang="pt" xlink:href="x" mailto:foo="y">link</a>'
+            '</body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        a = tree.find(".//a")
+        self.assertEqual(a.get("xml:lang"), "pt")
+        self.assertEqual(a.get("xlink:href"), "x")
+        self.assertNotIn("mailto:foo", a.attrib)
+
+    def test_preserves_attributes_without_colon(self):
+        tree = lxml_html.fromstring(
+            '<html><body><p id="x" class="y">text</p></body></html>'
+        )
+        remove_invalid_namespace_attributes(tree)
+        p = tree.find(".//p")
+        self.assertEqual(p.get("id"), "x")
+        self.assertEqual(p.get("class"), "y")
+
+    def test_handles_none_tree(self):
+        self.assertIsNone(remove_invalid_namespace_attributes(None))  # None is returned as-is
+
+    def test_load_html_strips_invalid_namespace_attributes(self):
+        tree = load_html('<p>foo <a mailto:dade="z" href="y">link</a> bar</p>')
+        a = tree.find(".//a")
+        self.assertNotIn("mailto:dade", a.attrib)
+        # The tree must serialize to XML that can be parsed again
+        serialized = ET.tostring(tree, method="xml").decode("utf-8")
+        ET.fromstring(serialized)
+
+    def test_get_fixed_html_output_is_valid_xml(self):
+        # Attribute value contains '>' so the regex-based
+        # ``remove_namespaces_from_content`` step (used inside ``fix()``)
+        # cannot reliably strip the bad attribute. The tree-level cleanup
+        # must still produce XML that re-parses without errors.
+        content = '<p>Hello <a mailto:dade="a>b" href="x">world</a></p>'
+        result = get_fixed_html(content)
+        wrapped = f"<root>{result}</root>"
+        ET.fromstring(wrapped)  # passes only if parsing does not raise