Skip to content

Commit 8ce0c86

Browse files
authored
Merge pull request #33 from adelavega/enh/read_elsevier
ENH: Enable reading in Elsevier-like inputs
2 parents 10716bf + a44d9da commit 8ce0c86

4 files changed

Lines changed: 188 additions & 82 deletions

File tree

autonima/config.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,11 +150,7 @@ def _validate_config(self, config: PipelineConfig) -> None:
150150
ConfigurationError: If configuration is invalid
151151
"""
152152
# Validate search configuration
153-
if config.search.pmids_file or config.search.pmids_list:
154-
# PMID-based search
155-
if config.search.query.strip():
156-
raise ConfigurationError("Cannot specify both search query and PMIDs list/file")
157-
else:
153+
if not (config.search.pmids_file or config.search.pmids_list):
158154
# Query-based search
159155
if not config.search.query.strip():
160156
raise ConfigurationError("Search query cannot be empty when not using PMIDs list/file")
@@ -205,6 +201,13 @@ def _validate_config(self, config: PipelineConfig) -> None:
205201
# Validate output directory
206202
if not config.output.directory.strip():
207203
raise ConfigurationError("Output directory cannot be empty")
204+
205+
# For each retrieval source, coordinates_path_templates and processed_data_path are mutually exclusive
206+
for source in config.retrieval.sources:
207+
if source.coordinates_path_templates and source.processed_data_path:
208+
raise ConfigurationError(
209+
"coordinates_path_templates and processed_data_path are mutually exclusive for each source"
210+
)
208211

209212
def get_config(self) -> PipelineConfig:
210213
"""Get the currently loaded configuration."""

autonima/coordinates/processor.py

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
"""Coordinate parsing processor for the pipeline."""
22

33
import logging
4-
import csv
5-
from pathlib import Path
64
from typing import List
75

86
from .openai_client import CoordinateParsingClient
@@ -37,25 +35,16 @@ def process_single_table(self, table):
3735
List of analyses extracted from the table
3836
"""
3937
try:
40-
# Load the table data
41-
for path_attr in self.path_preference:
42-
table_path_value = getattr(table, path_attr, None)
43-
if table_path_value:
44-
table_path = Path(table_path_value)
45-
break
46-
else:
38+
# Load the raw table content using the table's method
39+
table.load_raw_table()
40+
41+
# If we couldn't load the raw table content, return empty list
42+
if table.raw_table is None:
4743
logger.warning(f"No valid table path found for table: {table.table_id}")
4844
return []
49-
table_path = Path(table_path)
50-
if not table_path.exists():
51-
logger.warning(f"Table file not found: {table_path}")
52-
return []
5345

54-
# Read the table as text
55-
with open(table_path, "r", encoding="utf-8") as f:
56-
reader = csv.reader(f)
57-
rows = list(reader)
58-
table_text = "\n".join([",".join(r) for r in rows])
46+
# Use the raw_table content directly
47+
table_text = table.raw_table
5948

6049
# Create a prompt for the table
6150
prompt = self._create_table_prompt(
@@ -69,7 +58,7 @@ def process_single_table(self, table):
6958
return result.analyses
7059

7160
except Exception as e:
72-
logger.warning(f"Error processing table {table.table_path}: {e}")
61+
logger.warning(f"Error processing table {table.table_id}: {e}")
7362
return []
7463

7564
def _create_table_prompt(self, table_text: str, table_caption: str = "", table_foot: str = "") -> str:

autonima/models/types.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Type definitions and data models for Autonima."""
22

3+
import logging
34
from dataclasses import dataclass, field
45
from typing import List, Dict, Any, Optional
56
from enum import Enum
@@ -28,6 +29,39 @@ class ActivationTable:
2829
table_foot: Optional[str] = None # Footer of the table
2930
table_data_path: Optional[str] = None # Path to processed table data file
3031
table_raw_path: Optional[str] = None # Path to raw table data file
32+
raw_table: Optional[str] = None # Raw table content (e.g. XML), read as text from table_raw_path or table_data_path
33+
34+
def load_raw_table(self) -> None:
35+
"""
36+
Load the raw table content from the preferred path.
37+
38+
This method populates the raw_table attribute with the content
39+
of the first readable table file. If raw_table is already
40+
populated, this is a no-op. Otherwise table_raw_path is tried
41+
first, then table_data_path; raw_table stays None if both fail.
42+
"""
43+
# If raw_table is already populated, no need to do anything
44+
if self.raw_table is not None:
45+
return
46+
47+
# Define the path preference hierarchy
48+
path_preference = ['table_raw_path', 'table_data_path']
49+
50+
# Try to load the raw table content from the preferred paths
51+
for path_attr in path_preference:
52+
table_path_value = getattr(self, path_attr, None)
53+
if table_path_value:
54+
try:
55+
with open(table_path_value, 'r', encoding='utf-8') as f:
56+
self.raw_table = f.read()
57+
return
58+
except Exception as e:
59+
logging.warning(
60+
f"Failed to load raw table from {table_path_value}: {e}"
61+
)
62+
continue
63+
64+
logging.warning(f"No valid table path found for table: {self.table_id}")
3165

3266

3367
@dataclass

0 commit comments

Comments
 (0)