Skip to content

Commit 8f010d2

Browse files
refactor: reorganiza extração de XML com suporte a ISO-8859-1 (#1035)
* refactor: reorganiza extração de XML com suporte a ISO-8859-1 - Adiciona fallback para encoding ISO-8859-1 quando UTF-8 falha - Separa lógica em funções especializadas para XML e ZIP - Remove logs desnecessários e melhora tratamento de erros - Ignora arquivos ocultos em arquivos ZIP * Update packtools/sps/pid_provider/xml_sps_lib.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 5195561 commit 8f010d2

1 file changed

Lines changed: 113 additions & 51 deletions

File tree

packtools/sps/pid_provider/xml_sps_lib.py

Lines changed: 113 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class GetXmlWithPreFromURIError(Exception): ...
3939
class GetXMLItemsError(Exception): ...
4040

4141

42-
class GetXMLItemsFromZipFileError(Exception): ...
42+
# Raised by get_xml_with_pre_from_zip_file when a ZIP package, or one of its
# XML components, cannot be processed and errors are not being captured.
class GetXMLWithPreFromZipFileError(Exception): ...
4343

4444

4545
class XMLWithPreArticlePublicationDateError(Exception): ...
@@ -64,30 +64,21 @@ def get_xml_items(xml_sps_file_path, filenames=None, capture_errors=None):
6464
try:
6565
name, ext = os.path.splitext(xml_sps_file_path)
6666
if ext == ".zip":
67-
return get_xml_items_from_zip_file(
67+
return get_xml_with_pre_from_zip_file(
6868
xml_sps_file_path, filenames, capture_errors
6969
)
7070
if ext == ".xml":
71-
with open(xml_sps_file_path) as fp:
72-
xml = get_xml_with_pre(fp.read())
73-
xml.file_path = xml_sps_file_path
74-
item = os.path.basename(xml_sps_file_path)
75-
return [
76-
{
77-
"filename": item,
78-
"xml_with_pre": xml,
79-
"files": [item],
80-
"filenames": [item],
81-
}
82-
]
71+
try:
72+
return get_xml_with_pre_from_xml_file(xml_sps_file_path, "utf-8")
73+
except GetXmlWithPreError as e:
74+
return get_xml_with_pre_from_xml_file(xml_sps_file_path, "iso-8859-1")
8375

8476
raise TypeError(
8577
_("{} must be xml file or zip file containing xml").format(
8678
xml_sps_file_path
8779
)
8880
)
8981
except Exception as e:
90-
LOGGER.exception(e)
9182
if capture_errors:
9283
return [
9384
{
@@ -103,7 +94,23 @@ def get_xml_items(xml_sps_file_path, filenames=None, capture_errors=None):
10394
)
10495

10596

106-
def get_xml_items_from_zip_file(
97+
def get_xml_with_pre_from_xml_file(xml_sps_file_path, encoding):
    """
    Load a single XML file and return it as a one-item result list.

    Parameters
    ----------
    xml_sps_file_path : str
        Path to the XML file
    encoding : str
        Text encoding used to read the file (e.g. "utf-8", "iso-8859-1")

    Returns
    -------
    list of dict
        A single dict: {filename, xml_with_pre, files, filenames}, where
        `files` and `filenames` both hold only the base name of the file.

    Raises
    ------
    GetXmlWithPreError
        If the file content cannot be decoded using `encoding`.
    """
    try:
        with open(xml_sps_file_path, encoding=encoding) as fp:
            content = fp.read()
    except UnicodeDecodeError as e:
        # The caller retries with another encoding only when it sees
        # GetXmlWithPreError, so a decoding failure must be reported with
        # that type; otherwise the ISO-8859-1 fallback is never reached.
        raise GetXmlWithPreError(
            f"Unable to read {xml_sps_file_path} as {encoding}: {e}"
        ) from e

    xml = get_xml_with_pre(content)
    xml.file_path = xml_sps_file_path
    item = os.path.basename(xml_sps_file_path)
    return [
        {
            "filename": item,
            "xml_with_pre": xml,
            "files": [item],
            "filenames": [item],
        }
    ]
111+
112+
113+
def get_xml_with_pre_from_zip_file(
107114
xml_sps_file_path, filenames=None, capture_errors=False
108115
):
109116
"""
@@ -127,55 +134,110 @@ def get_xml_items_from_zip_file(
127134
-----
128135
Yields errors as dicts instead of raising. Check for 'error' key.
129136
"""
130-
filenames = []
131-
basenames = []
132137
try:
133-
with ZipFile(xml_sps_file_path) as zf:
134-
zip_files = zf.namelist()
135-
check_files = filenames or zip_files
136-
xml_files = (f for f in check_files if f.endswith(".xml"))
138+
paths = []
139+
basenames = []
137140

138-
if not xml_files:
139-
raise TypeError(f"{xml_sps_file_path} has no XML files")
141+
zip_data = get_xml_items_from_zip_file(
142+
xml_sps_file_path, filenames,
143+
)
144+
xml_files = zip_data.get("xml_files")
145+
if not xml_files:
146+
raise TypeError(f"{xml_sps_file_path} has no XML files")
147+
148+
paths = zip_data.get("paths")
149+
basenames = zip_data.get("basenames")
150+
151+
for basename, xml_file in xml_files:
152+
try:
153+
response = {
154+
"filename": xml_file,
155+
"files": paths,
156+
"filenames": basenames,
157+
}
158+
xml_with_pre = get_xml_with_pre_from_zip_file_component(xml_sps_file_path, xml_file)
159+
xml_with_pre.zip_file_path = xml_sps_file_path
160+
response["xml_with_pre"] = xml_with_pre
161+
yield response
162+
except Exception as e:
163+
if not capture_errors:
164+
raise GetXMLWithPreFromZipFileError(f"Error in {xml_sps_file_path}/{xml_file}")
165+
response["error"] = str(e)
166+
response["type_error"] = type(e).__name__
167+
yield response
140168

141-
basenames = list(os.path.basename(n) for n in zip_files if n)
142-
for xml_file in xml_files:
143-
try:
144-
xml_with_pre = get_xml_with_pre(zf.read(xml_file).decode("utf-8"))
145-
xml_with_pre.zip_file_path = xml_sps_file_path
146-
yield {
147-
"filename": xml_file,
148-
"xml_with_pre": xml_with_pre,
149-
"files": check_files,
150-
"filenames": basenames,
151-
}
152-
except Exception as e:
153-
if not capture_errors:
154-
LOGGER.exception(f"Error in {xml_sps_file_path}/{xml_file}")
155-
156-
yield {
157-
"filename": xml_file,
158-
"files": check_files,
159-
"filenames": basenames,
160-
"error": str(e),
161-
"type_error": type(e).__name__,
162-
}
163169
except Exception as e:
164-
LOGGER.exception(e)
165170
if not capture_errors:
166-
raise GetXMLItemsFromZipFileError(
171+
raise GetXMLWithPreFromZipFileError(
167172
_("Unable to get xml items from zip file {}: {} {}").format(
168173
xml_sps_file_path, type(e).__name__, e
169174
)
170175
)
171176
yield {
172-
"files": filenames,
177+
"files": paths,
173178
"filenames": basenames,
174179
"error": str(e),
175180
"type_error": type(e).__name__,
176181
}
177182

178183

184+
def get_xml_items_from_zip_file(
    xml_sps_file_path, filenames=None,
):
    """
    List the components of a ZIP file and select its XML entries.

    The XML content is neither read nor parsed here; only the entry names
    are inspected.

    Parameters
    ----------
    xml_sps_file_path : str
        Path to the ZIP file
    filenames : list of str, optional
        Base names of the XML entries to select. If None or empty, every
        non-hidden XML entry is selected.

    Returns
    -------
    dict
        basenames : list of str
            Base name of every non-empty entry name found in the ZIP
        paths : list of str
            Entry names exactly as returned by ``ZipFile.namelist()``
        xml_files : list of tuple
            ``(basename, path)`` pairs of the selected XML entries

    Notes
    -----
    Entries whose base name starts with "." (hidden files) are ignored.
    Errors raised while opening the ZIP file are not handled here and
    propagate to the caller.
    """
    with ZipFile(xml_sps_file_path) as zf:
        zip_components = zf.namelist()

    basenames = [os.path.basename(n) for n in zip_components if n]

    xml_files = []
    for item in zip_components:
        if not item.endswith(".xml"):
            continue

        basename = os.path.basename(item)
        if basename.startswith("."):
            # hidden files are not part of the package content
            continue

        if not filenames or basename in filenames:
            xml_files.append((basename, item))

    return {
        "basenames": basenames,
        "paths": zip_components,
        "xml_files": xml_files,
    }
230+
231+
232+
def get_xml_with_pre_from_zip_file_component(xml_sps_file_path, xml_file):
    """
    Read one entry of a ZIP file and build its XML representation.

    Parameters
    ----------
    xml_sps_file_path : str
        Path to the ZIP file
    xml_file : str
        Name of the entry inside the ZIP file

    Returns
    -------
    The value returned by ``get_xml_with_pre`` for the entry content.

    Notes
    -----
    The bytes are first handled as UTF-8; if decoding or parsing fails for
    any reason, a second attempt is made decoding them as ISO-8859-1.
    """
    with ZipFile(xml_sps_file_path) as archive:
        raw_bytes = archive.read(xml_file)

    try:
        utf8_text = raw_bytes.decode("utf-8")
        return get_xml_with_pre(utf8_text)
    except Exception:
        latin1_text = raw_bytes.decode("iso-8859-1")
        return get_xml_with_pre(latin1_text)
239+
240+
179241
def update_zip_file_xml(xml_sps_file_path, xml_file_path, content):
180242
"""
181243
Save XML content in a Zip file.
@@ -282,7 +344,7 @@ def get_xml_with_pre(xml_content):
282344
)
283345
try:
284346
return XMLWithPre(pref, etree.fromstring(xml))
285-
except etree.XMLSyntaxError:
347+
except etree.XMLSyntaxError as e:
286348
return XMLWithPre(pref, etree.fromstring(fix_pre_loading(xml)))
287349
except Exception as e:
288350
if xml_content:
@@ -320,7 +382,7 @@ def split_processing_instruction_doctype_declaration_and_xml(xml_content):
320382
if p >= 0:
321383
return xml_content[:p], xml_content[p:]
322384

323-
return "", xml_content
385+
return "", xml_content.strip()
324386

325387

326388
class XMLWithPre:

0 commit comments

Comments
 (0)