@@ -39,7 +39,7 @@ class GetXmlWithPreFromURIError(Exception): ...
# NOTE(review): no raise site for this exception is visible in this excerpt;
# presumably it is raised by get_xml_items -- confirm against the full module.
class GetXMLItemsError(Exception):
    """Module-specific exception type; carries no extra state or behavior."""
4040
4141
42- class GetXMLItemsFromZipFileError (Exception ): ...
class GetXMLWithPreFromZipFileError(Exception):
    """
    Raised by get_xml_with_pre_from_zip_file when reading a ZIP package or
    one of its XML entries fails and the caller did not ask for errors to be
    captured (``capture_errors`` is falsy).
    """
4343
4444
# NOTE(review): no raise site for this exception is visible in this excerpt;
# the name suggests it relates to the article publication date handled by
# XMLWithPre -- confirm against the full module.
class XMLWithPreArticlePublicationDateError(Exception):
    """Module-specific exception type; carries no extra state or behavior."""
@@ -64,30 +64,21 @@ def get_xml_items(xml_sps_file_path, filenames=None, capture_errors=None):
6464 try :
6565 name , ext = os .path .splitext (xml_sps_file_path )
6666 if ext == ".zip" :
67- return get_xml_items_from_zip_file (
67+ return get_xml_with_pre_from_zip_file (
6868 xml_sps_file_path , filenames , capture_errors
6969 )
7070 if ext == ".xml" :
71- with open (xml_sps_file_path ) as fp :
72- xml = get_xml_with_pre (fp .read ())
73- xml .file_path = xml_sps_file_path
74- item = os .path .basename (xml_sps_file_path )
75- return [
76- {
77- "filename" : item ,
78- "xml_with_pre" : xml ,
79- "files" : [item ],
80- "filenames" : [item ],
81- }
82- ]
71+ try :
72+ return get_xml_with_pre_from_xml_file (xml_sps_file_path , "utf-8" )
73+ except GetXmlWithPreError as e :
74+ return get_xml_with_pre_from_xml_file (xml_sps_file_path , "iso-8859-1" )
8375
8476 raise TypeError (
8577 _ ("{} must be xml file or zip file containing xml" ).format (
8678 xml_sps_file_path
8779 )
8880 )
8981 except Exception as e :
90- LOGGER .exception (e )
9182 if capture_errors :
9283 return [
9384 {
@@ -103,7 +94,23 @@ def get_xml_items(xml_sps_file_path, filenames=None, capture_errors=None):
10394 )
10495
10596
106- def get_xml_items_from_zip_file (
def get_xml_with_pre_from_xml_file(xml_sps_file_path, encoding):
    """
    Load a standalone XML file and describe it as a one-item list.

    Parameters
    ----------
    xml_sps_file_path : str
        Path to the XML file
    encoding : str
        Text encoding used to open the file (the visible caller passes
        "utf-8" and then "iso-8859-1")

    Returns
    -------
    list of dict
        A single dict with the keys ``filename``, ``xml_with_pre``,
        ``files`` and ``filenames``; the file's base name is used for
        ``filename`` and as the only element of ``files`` and ``filenames``.

    Notes
    -----
    Nothing is caught here: errors from opening/decoding the file or from
    get_xml_with_pre reach the caller unchanged.
    """
    with open(xml_sps_file_path, encoding=encoding) as xml_file:
        text = xml_file.read()

    xml_with_pre = get_xml_with_pre(text)
    # Remember where the document came from.
    xml_with_pre.file_path = xml_sps_file_path

    name = os.path.basename(xml_sps_file_path)
    entry = {
        "filename": name,
        "xml_with_pre": xml_with_pre,
        "files": [name],
        "filenames": [name],
    }
    return [entry]
111+
112+
113+ def get_xml_with_pre_from_zip_file (
107114 xml_sps_file_path , filenames = None , capture_errors = False
108115):
109116 """
@@ -127,55 +134,110 @@ def get_xml_items_from_zip_file(
127134 -----
128135 Yields errors as dicts instead of raising. Check for 'error' key.
129136 """
130- filenames = []
131- basenames = []
132137 try :
133- with ZipFile (xml_sps_file_path ) as zf :
134- zip_files = zf .namelist ()
135- check_files = filenames or zip_files
136- xml_files = (f for f in check_files if f .endswith (".xml" ))
138+ paths = []
139+ basenames = []
137140
138- if not xml_files :
139- raise TypeError (f"{ xml_sps_file_path } has no XML files" )
141+ zip_data = get_xml_items_from_zip_file (
142+ xml_sps_file_path , filenames ,
143+ )
144+ xml_files = zip_data .get ("xml_files" )
145+ if not xml_files :
146+ raise TypeError (f"{ xml_sps_file_path } has no XML files" )
147+
148+ paths = zip_data .get ("paths" )
149+ basenames = zip_data .get ("basenames" )
150+
151+ for basename , xml_file in xml_files :
152+ try :
153+ response = {
154+ "filename" : xml_file ,
155+ "files" : paths ,
156+ "filenames" : basenames ,
157+ }
158+ xml_with_pre = get_xml_with_pre_from_zip_file_component (xml_sps_file_path , xml_file )
159+ xml_with_pre .zip_file_path = xml_sps_file_path
160+ response ["xml_with_pre" ] = xml_with_pre
161+ yield response
162+ except Exception as e :
163+ if not capture_errors :
164+ raise GetXMLWithPreFromZipFileError (f"Error in { xml_sps_file_path } /{ xml_file } " )
165+ response ["error" ] = str (e )
166+ response ["type_error" ] = type (e ).__name__
167+ yield response
140168
141- basenames = list (os .path .basename (n ) for n in zip_files if n )
142- for xml_file in xml_files :
143- try :
144- xml_with_pre = get_xml_with_pre (zf .read (xml_file ).decode ("utf-8" ))
145- xml_with_pre .zip_file_path = xml_sps_file_path
146- yield {
147- "filename" : xml_file ,
148- "xml_with_pre" : xml_with_pre ,
149- "files" : check_files ,
150- "filenames" : basenames ,
151- }
152- except Exception as e :
153- if not capture_errors :
154- LOGGER .exception (f"Error in { xml_sps_file_path } /{ xml_file } " )
155-
156- yield {
157- "filename" : xml_file ,
158- "files" : check_files ,
159- "filenames" : basenames ,
160- "error" : str (e ),
161- "type_error" : type (e ).__name__ ,
162- }
163169 except Exception as e :
164- LOGGER .exception (e )
165170 if not capture_errors :
166- raise GetXMLItemsFromZipFileError (
171+ raise GetXMLWithPreFromZipFileError (
167172 _ ("Unable to get xml items from zip file {}: {} {}" ).format (
168173 xml_sps_file_path , type (e ).__name__ , e
169174 )
170175 )
171176 yield {
172- "files" : filenames ,
177+ "files" : paths ,
173178 "filenames" : basenames ,
174179 "error" : str (e ),
175180 "type_error" : type (e ).__name__ ,
176181 }
177182
178183
def get_xml_items_from_zip_file(
    xml_sps_file_path, filenames=None,
):
    """
    List the components of a ZIP file and select its XML entries.

    Only the archive index is inspected; no entry is read or parsed here.

    Parameters
    ----------
    xml_sps_file_path : str
        Path to the ZIP file
    filenames : list of str, optional
        Base names of the XML entries to select. If None or empty, every
        eligible XML entry is selected.

    Returns
    -------
    dict
        basenames : list of str
            Base name of each archive entry, in archive order.
        paths : list of str
            Entry names exactly as reported by ``ZipFile.namelist()``.
        xml_files : list of tuple
            ``(basename, path)`` for each selected XML entry.

    Notes
    -----
    An entry is eligible when its name ends with ".xml" and its base name
    does not start with "." (hidden files are ignored).
    Exceptions raised while opening the archive are not handled here.
    """
    with ZipFile(xml_sps_file_path) as zf:
        zip_components = zf.namelist()

    basenames = [os.path.basename(n) for n in zip_components if n]

    xml_files = []
    for item in zip_components:
        if not item.endswith(".xml"):
            continue

        basename = os.path.basename(item)
        if basename.startswith("."):
            # dot-prefixed entries are hidden files, not package documents
            continue

        # A falsy `filenames` means "no filter".
        if not filenames or basename in filenames:
            xml_files.append((basename, item))

    return {
        "basenames": basenames,
        "paths": zip_components,
        "xml_files": xml_files,
    }
230+
231+
def get_xml_with_pre_from_zip_file_component(xml_sps_file_path, xml_file):
    """
    Read one entry of a ZIP file and build its XML object.

    Parameters
    ----------
    xml_sps_file_path : str
        Path to the ZIP file
    xml_file : str
        Name of the entry inside the archive

    Returns
    -------
    object
        Whatever get_xml_with_pre returns for the decoded entry content.

    Notes
    -----
    The entry bytes are first decoded as UTF-8. If decoding *or* parsing
    raises any exception, a second attempt is made with ISO-8859-1; an
    error from that second attempt reaches the caller.
    """
    with ZipFile(xml_sps_file_path) as archive:
        raw_bytes = archive.read(xml_file)

    try:
        utf8_text = raw_bytes.decode("utf-8")
        return get_xml_with_pre(utf8_text)
    except Exception:
        # Deliberately broad: the fallback covers parse failures as well as
        # decode failures.
        latin1_text = raw_bytes.decode("iso-8859-1")
        return get_xml_with_pre(latin1_text)
239+
240+
179241def update_zip_file_xml (xml_sps_file_path , xml_file_path , content ):
180242 """
181243 Save XML content in a Zip file.
@@ -282,7 +344,7 @@ def get_xml_with_pre(xml_content):
282344 )
283345 try :
284346 return XMLWithPre (pref , etree .fromstring (xml ))
285- except etree .XMLSyntaxError :
347+ except etree .XMLSyntaxError as e :
286348 return XMLWithPre (pref , etree .fromstring (fix_pre_loading (xml )))
287349 except Exception as e :
288350 if xml_content :
@@ -320,7 +382,7 @@ def split_processing_instruction_doctype_declaration_and_xml(xml_content):
320382 if p >= 0 :
321383 return xml_content [:p ], xml_content [p :]
322384
323- return "" , xml_content
385+ return "" , xml_content . strip ()
324386
325387
326388class XMLWithPre :
0 commit comments