88from abc import ABC , abstractmethod
99from typing import Any
1010import logging
11+ import re
12+ import tempfile
1113from pathlib import Path
1214from functools import cached_property
1315
@@ -168,6 +170,77 @@ def find_file_encoding(self, file: str | Path) -> str | None:
168170 result = from_path (file ).best ()
169171 return result .encoding
170172
def _fix_malformed_dates(self, path_file: Path) -> Path:
    """Return a path to an OBO file with problematic date lines removed.

    Some OBO files (e.g., PSI-MI) carry a header date written as
    ``date: DD:MM:YYYY HH:MM`` (for example ``date: 15:04:2021 22:57``)
    rather than an ISO date. According to the original author's notes,
    once fastobo has seen such a header date it also fails on the terms'
    ``creation_date`` fields, even though those are valid ISO dates. The
    workaround is therefore to drop both the header date line(s) AND
    every ``creation_date:`` line.

    NOTE(review): ``dd:MM:yyyy HH:mm`` looks like the header date layout
    described by the OBO flat-file specification -- confirm that this
    line really is the root cause of the parse failure.

    The file is only rewritten when at least one such header date line
    is found. The cleaned content goes to a new temporary ``.obo`` file
    that is NOT deleted automatically; the caller is expected to remove
    it once the ontology has been loaded.

    Args:
        path_file: Path to the original OBO file.

    Returns:
        ``path_file`` itself when nothing had to be changed or when the
        fix could not be applied; otherwise the path of the temporary
        file holding the cleaned content.
    """
    # ``\n?`` (instead of a mandatory ``\n``) so that a matching line at the
    # very end of a file without a trailing newline is removed as well.
    header_date_pattern = r'^date: \d{2}:\d{2}:\d{4}.*\n?'
    creation_date_pattern = r'^creation_date:.*\n?'

    # Set as soon as the temporary file exists, so the error path can
    # remove it instead of leaving an orphan behind.
    temp_path: Path | None = None

    try:
        encoding = self.find_file_encoding(path_file)
        with open(path_file, 'r', encoding=encoding) as f:
            content = f.read()

        # Detect and strip the header date line(s) in a single pass.
        fixed_content, removed_headers = re.subn(
            header_date_pattern,
            '',
            content,
            flags=re.MULTILINE,
        )
        if not removed_headers:
            # No malformed dates, keep using the original file.
            return path_file

        logger.warning(
            f"Detected malformed date format in {path_file}, fixing..."
        )

        # Also remove all creation_date fields from terms (see docstring).
        fixed_content = re.sub(
            creation_date_pattern,
            '',
            fixed_content,
            flags=re.MULTILINE,
        )

        # delete=False: the file has to outlive this method. The context
        # manager guarantees the handle is closed even if write() fails.
        with tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.obo',
            delete=False,
            encoding=encoding,
        ) as temp_file:
            temp_path = Path(temp_file.name)
            temp_file.write(fixed_content)

    except Exception as e:
        # Deliberately broad: this is a best-effort pre-processing step and
        # any failure must fall back to the untouched original file.
        if temp_path is not None:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                logger.debug(
                    f"Could not remove temporary file {temp_path}: {cleanup_error}"
                )
        logger.warning(f"Failed to fix malformed dates: {e}, using original file")
        return path_file

    logger.info(
        f"Fixed malformed dates in {path_file}, using temporary file: {temp_path}"
    )
    return temp_path
243+
171244 def _load_ontology (
172245 self , path_file : Path
173246 ) -> tuple [pronto .Ontology , str | None ]:
@@ -189,18 +262,37 @@ def _load_ontology(
189262 logger .error (error_msg )
190263 raise FileNotFoundError (error_msg )
191264
192- logger .debug (f'Parsing ontology file with Pronto: { path_file } ' )
265+ # Fix malformed dates if needed
266+ fixed_path = self ._fix_malformed_dates (path_file )
267+
268+ logger .debug (f'Parsing ontology file with Pronto: { fixed_path } ' )
193269 try :
194270 ontology : pronto .Ontology = pronto .Ontology (
195- path_file , encoding = self .find_file_encoding (path_file )
271+ fixed_path , encoding = self .find_file_encoding (fixed_path )
196272 )
197273 except (TypeError , ValueError ) as e :
198274 error_msg = f'Failed to load ontology from { path_file } : { str (e )} '
199275 logger .exception (error_msg )
276+ # Clean up temp file if it was created
277+ if fixed_path != path_file :
278+ try :
279+ fixed_path .unlink ()
280+ except Exception :
281+ pass
200282 raise ValueError (error_msg ) from e
201283
202284 ontology_id : str | None = self ._extract_ontology_id (ontology )
203285 logger .debug (f'Loaded ontology with ID: { ontology_id } ' )
286+
287+ # Clean up temp file after successful loading (if different from original)
288+ # Note: We keep the temp file until after ontology is fully loaded
289+ if fixed_path != path_file :
290+ try :
291+ fixed_path .unlink ()
292+ logger .debug (f'Cleaned up temporary file: { fixed_path } ' )
293+ except Exception as e :
294+ logger .warning (f'Failed to clean up temporary file { fixed_path } : { e } ' )
295+
204296 return ontology , ontology_id
205297
206298 def _create_ontology_object (
0 commit comments