Skip to content

Commit 24f4353

Browse files
Corrige leitura de xml considerando entidades (#1026)
* Criar um dicionário para lidar com entidades que são conversíveis por biblioteca
* Adiciona funções para ler xml e tratar as entidades
* Uso da função fix_entities em get_xml_with_pre
1 parent ced0536 commit 24f4353

3 files changed

Lines changed: 541 additions & 1 deletion

File tree

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import html
2+
import logging
3+
from lxml import etree
4+
from bs4 import BeautifulSoup
5+
from packtools.sps.pid_provider.name2number import NAME_TO_NUMBER_ENTITIES
6+
7+
8+
def fix_entities(xml):
    """Return ``xml`` with its character entities converted where possible.

    The text is run through ``html_parser_ent2char`` and the serialized
    result is post-processed by ``format_output``.

    ``html_parser_ent2char`` logs the error and returns ``None`` when it
    cannot parse or serialize the input. In that case the original ``xml``
    is returned unchanged instead of letting ``format_output`` fail with a
    ``TypeError`` on ``None``.
    """
    converted = html_parser_ent2char(xml)
    if converted is None:
        # Conversion failed and was already logged; keep the input as is.
        return xml
    return format_output(converted)
10+
11+
12+
def discover_entities_to_fix_in_output(bkp):
    """Yield the double-escaped entities (``&amp;name;``) found in ``bkp``.

    A marker is inserted before every ``&amp;`` (which is reduced to ``&``)
    and after every ``;`` so that the text can be split into chunks. A chunk
    is reported when it is not blank, contains no space character, starts
    with ``&`` and ends with ``;``. Each reported chunk is yielded with its
    ``&`` turned back into ``&amp;``, in order of appearance; duplicates are
    not removed.
    """
    marker = "<ISOLAENTIDADEXML>"
    marked = bkp.replace("&amp;", marker + "&").replace(";", ";" + marker)

    for chunk in marked.split(marker):
        is_single_token = chunk.strip() and " " not in chunk
        if is_single_token and chunk.startswith("&") and chunk.endswith(";"):
            yield chunk.replace("&", "&amp;")
23+
24+
25+
def format_output(xml):
    """Replace double-escaped entities in ``xml`` using the mapping table.

    Text without any ``&`` is returned untouched. Otherwise each distinct
    entity reported by ``discover_entities_to_fix_in_output`` is replaced by
    the value ``NAME_TO_NUMBER_ENTITIES`` holds for it; an entity with no
    (truthy) mapping is left as it is.
    """
    if "&" not in xml:
        return xml

    # The set is built once, from the text as it is before any replacement.
    for entity in set(discover_entities_to_fix_in_output(xml)):
        replacement = NAME_TO_NUMBER_ENTITIES.get(entity) or entity
        xml = xml.replace(entity, replacement)
    return xml
36+
37+
38+
def xml_parser_ent2char(xml):
    """Parse ``xml`` with a recovering lxml XML parser and re-serialize it.

    Returns the serialized document as a ``str``. When parsing or
    serialization raises, the exception is logged and ``None`` is returned.
    """
    try:
        recovering_parser = etree.XMLParser(recover=True, encoding="utf-8")
        tree_root = etree.fromstring(xml, recovering_parser)
        serialized = etree.tostring(tree_root, method="xml", encoding="utf-8")
        return serialized.decode("utf-8")
    except Exception as e:
        logging.info("opção 1")
        logging.exception(e)
    return None
46+
47+
48+
def html_unescape_ent2char(xml):
    """Unescape ``xml`` with ``html.unescape`` and re-serialize it via lxml.

    Returns the serialized document as a ``str``. When unescaping, parsing
    or serialization raises, the exception is logged and ``None`` is
    returned.
    """
    try:
        unescaped = html.unescape(xml)
        tree_root = etree.fromstring(unescaped)
        serialized = etree.tostring(tree_root, method="xml", encoding="utf-8")
        return serialized.decode("utf-8")
    except Exception as e:
        logging.info("opção 2")
        logging.exception(e)
    return None
56+
57+
58+
def html_parser_ent2char(xml):
    """Parse ``xml`` with lxml's HTML parser and serialize the document node.

    The first child element of the ``body`` found under the parsed root is
    serialized with ``method="xml"`` and returned as a ``str``. When any
    step raises (including a missing ``body`` or child), the exception is
    logged and ``None`` is returned.
    """
    try:
        html_parser = etree.HTMLParser()
        parsed_root = etree.fromstring(xml, html_parser)
        body = parsed_root.find(".").find("body")
        first_child = body.find("*")
        serialized = etree.tostring(first_child, method="xml", encoding="utf-8")
        return serialized.decode("utf-8")
    except Exception as e:
        logging.info("opção 3")
        logging.exception(e)
    return None
66+
67+
68+
def bs_ent2char_(xml):
    """Yield ``xml`` as rendered by BeautifulSoup with several parsers.

    For each parser name, in the order listed, a header is printed and the
    ``str()`` of the resulting soup is yielded. The second item of each
    pair is a description that is not used at runtime.
    """
    parsers = (
        ("xml", "Alias para lxml-xml"),
        ("lxml", "Parser HTML com lxml, rápido"),
        ("html.parser", "Parser HTML built-in do Python"),
        ("html5lib", "Parser HTML5 mais compatível"),  # Needs to be installed
    )
    for parser, _description in parsers:
        print(f"\n---\n{parser}")
        yield str(BeautifulSoup(xml, parser))
79+
80+
81+
def bs_ent2char(xml):
    """Return the ``str()`` of ``xml`` parsed by BeautifulSoup with "lxml"."""
    return str(BeautifulSoup(xml, "lxml"))
84+
85+
86+
def main():
    """Print a sample XML and the output of each conversion helper for it."""
    sample = """<document>
<title>Exemplo com Entidades</title>
<content>&rsquo;&iacute;
<paragraph>&ldquo;Quotes&rdquo; e &lquo;apostrophes&rquo;</paragraph>
<special>&mdash; travessão &nbsp; espaço &copy;2024</special>
<price>&euro;100 ou &pound;80</price>
<math>&frac12; &times; 2 = 1</math>
<nested>
<item id="1">Primeiro &rquo;item&lquo;</item>
<item id="2">Segundo &mdash; item</item>
</nested>
<p>mdash : &mdash;</p>
<p>180 : &#180;</p>
<p>rquo : &rquo;<break/>191 : &#191; | &#x02019;</p>
<p>187 : &#187;</p>
</content>
</document>"""

    print("\n---\nEntrada")
    print(sample)

    # Each helper is shown under its own header, in this order.
    converters = (
        ("\n---\nxml_parser_ent2char", xml_parser_ent2char),
        ("\n---\nhtml_unescape_ent2char", html_unescape_ent2char),
        ("\n---\nhtml_parser_ent2char", html_parser_ent2char),
        ("\n---\nbs_ent2char", bs_ent2char),
    )
    for header, convert in converters:
        print(header)
        print(convert(sample))

    # bs_ent2char_ prints its own per-parser header before each result.
    for rendered in bs_ent2char_(sample):
        print("")
        print(rendered)

    print("\n---\nfix_entities")
    print(fix_entities(sample))
126+
127+
128+
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)