Skip to content

Commit cc62cf0

Browse files
committed
fixed psi-mi ontology date error
1 parent f3c5fa1 commit cc62cf0

1 file changed

Lines changed: 94 additions & 2 deletions

File tree

ontograph/loader.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from abc import ABC, abstractmethod
99
from typing import Any
1010
import logging
11+
import re
12+
import tempfile
1113
from pathlib import Path
1214
from functools import cached_property
1315

@@ -168,6 +170,77 @@ def find_file_encoding(self, file: str | Path) -> str | None:
168170
result = from_path(file).best()
169171
return result.encoding
170172

173+
def _fix_malformed_dates(self, path_file: Path) -> Path:
    """Return a path to an OBO file with problematic date fields stripped.

    Some OBO files (e.g., PSI-MI) carry a header date written as
    ``DD:MM:YYYY HH:MM`` (for example ``date: 15:04:2021 22:57``) rather
    than as an ISO 8601 date.

    When fastobo encounters such a header date it appears to leave the date
    parser in a bad state, so that parsing of per-term ``creation_date``
    fields subsequently fails (even though those are in proper ISO format).
    The workaround is to remove both the header date line AND every
    ``creation_date`` line, and to parse the cleaned copy instead.

    NOTE(review): the OBO 1.4 flat-file guide seems to define the header
    ``date`` tag as ``dd:MM:yyyy HH:mm``, in which case this pattern also
    matches spec-conformant files and drops their ``creation_date``
    metadata. Confirm against the spec and fastobo before relying on it.

    Args:
        path_file: Path to the original OBO file.

    Returns:
        ``path_file`` itself when no matching header date is found or when
        anything goes wrong while fixing (best effort); otherwise the path
        of a newly created temporary ``.obo`` file holding the cleaned
        content. The caller is responsible for deleting that temporary
        file.
    """
    # A line is terminated by "\n" or by end-of-input, so a date line that
    # is the very last line of a file without a trailing newline is still
    # matched and removed.
    header_date_pattern = r'^date: \d{2}:\d{2}:\d{4}.*(?:\n|\Z)'
    creation_date_pattern = r'^creation_date:.*(?:\n|\Z)'

    # Tracks the temp file so it can be removed if writing it fails.
    temp_path: Path | None = None

    try:
        encoding = self.find_file_encoding(path_file)
        with open(path_file, 'r', encoding=encoding) as f:
            content = f.read()

        if not re.search(header_date_pattern, content, flags=re.MULTILINE):
            # No DD:MM:YYYY header date, use the original file untouched.
            return path_file

        logger.warning(
            f"Detected malformed date format in {path_file}, fixing..."
        )

        # Drop the header date line first ...
        fixed_content = re.sub(
            header_date_pattern, '', content, flags=re.MULTILINE
        )
        # ... then every creation_date line, because the header date is
        # believed to break fastobo's handling of those fields as well.
        fixed_content = re.sub(
            creation_date_pattern, '', fixed_content, flags=re.MULTILINE
        )

        # delete=False: the file must outlive this method so the caller can
        # parse it. The context manager guarantees the handle is closed
        # even if the write raises.
        with tempfile.NamedTemporaryFile(
            mode='w',
            suffix='.obo',
            delete=False,
            encoding=encoding,
        ) as temp_file:
            temp_path = Path(temp_file.name)
            temp_file.write(fixed_content)

        logger.info(
            f"Fixed malformed dates in {path_file}, using temporary file: {temp_path}"
        )
        return temp_path

    except Exception as e:
        # Best effort by design: any failure falls back to the original
        # file. Remove a partially written temp file so it is not orphaned.
        if temp_path is not None:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError as cleanup_error:
                logger.debug(
                    f"Could not remove temporary file {temp_path}: {cleanup_error}"
                )
        logger.warning(f"Failed to fix malformed dates: {e}, using original file")
        return path_file
243+
171244
def _load_ontology(
172245
self, path_file: Path
173246
) -> tuple[pronto.Ontology, str | None]:
@@ -189,18 +262,37 @@ def _load_ontology(
189262
logger.error(error_msg)
190263
raise FileNotFoundError(error_msg)
191264

192-
logger.debug(f'Parsing ontology file with Pronto: {path_file}')
265+
# Fix malformed dates if needed
266+
fixed_path = self._fix_malformed_dates(path_file)
267+
268+
logger.debug(f'Parsing ontology file with Pronto: {fixed_path}')
193269
try:
194270
ontology: pronto.Ontology = pronto.Ontology(
195-
path_file, encoding=self.find_file_encoding(path_file)
271+
fixed_path, encoding=self.find_file_encoding(fixed_path)
196272
)
197273
except (TypeError, ValueError) as e:
198274
error_msg = f'Failed to load ontology from {path_file}: {str(e)}'
199275
logger.exception(error_msg)
276+
# Clean up temp file if it was created
277+
if fixed_path != path_file:
278+
try:
279+
fixed_path.unlink()
280+
except Exception:
281+
pass
200282
raise ValueError(error_msg) from e
201283

202284
ontology_id: str | None = self._extract_ontology_id(ontology)
203285
logger.debug(f'Loaded ontology with ID: {ontology_id}')
286+
287+
# Clean up temp file after successful loading (if different from original)
288+
# Note: We keep the temp file until after ontology is fully loaded
289+
if fixed_path != path_file:
290+
try:
291+
fixed_path.unlink()
292+
logger.debug(f'Cleaned up temporary file: {fixed_path}')
293+
except Exception as e:
294+
logger.warning(f'Failed to clean up temporary file {fixed_path}: {e}')
295+
204296
return ontology, ontology_id
205297

206298
def _create_ontology_object(

0 commit comments

Comments
 (0)