Skip to content

Commit d33a003

Browse files
committed
Enable multiple sources
1 parent 31a2525 commit d33a003

4 files changed

Lines changed: 69 additions & 49 deletions

File tree

autonima/config.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -87,9 +87,22 @@ def load_from_dict(self, config_dict: Dict[str, Any]) -> PipelineConfig:
8787
screening_dict['fulltext']
8888
)
8989

90-
retrieval_config = RetrievalConfig(
91-
**config_dict.get('retrieval', {})
92-
)
90+
retrieval_config = RetrievalConfig()
91+
if 'retrieval' in config_dict:
92+
retrieval_dict = config_dict['retrieval']
93+
# Handle backward compatibility for single full_text_source
94+
if 'full_text_source' in retrieval_dict:
95+
if retrieval_dict['full_text_source'] is not None:
96+
retrieval_config.full_text_sources = [retrieval_dict['full_text_source']]
97+
# Remove the old key to avoid conflicts
98+
retrieval_dict = {k: v for k, v in retrieval_dict.items() if k != 'full_text_source'}
99+
# Handle new full_text_sources
100+
if 'full_text_sources' in retrieval_dict:
101+
retrieval_config.full_text_sources = retrieval_dict['full_text_sources']
102+
# Set other retrieval config values
103+
for key, value in retrieval_dict.items():
104+
if hasattr(retrieval_config, key) and key != 'full_text_sources':
105+
setattr(retrieval_config, key, value)
93106
output_config = OutputConfig(**config_dict.get('output', {}))
94107

95108
# Create main config

autonima/models/types.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -129,8 +129,8 @@ class RetrievalConfig:
129129
max_retries: int = 3
130130
download_directory: str = "downloads"
131131
n_jobs: int = 1
132-
# Optional full text source configuration
133-
full_text_source: Optional[Dict[str, Any]] = None
132+
# Optional full text source configurations
133+
full_text_sources: List[Dict[str, Any]] = field(default_factory=list)
134134

135135

136136
@dataclass
@@ -172,7 +172,7 @@ def to_dict(self) -> Dict[str, Any]:
172172
"max_retries": self.retrieval.max_retries,
173173
"download_directory": self.retrieval.download_directory,
174174
"n_jobs": self.retrieval.n_jobs,
175-
"full_text_source": self.retrieval.full_text_source,
175+
"full_text_sources": self.retrieval.full_text_sources,
176176
},
177177
"output": {
178178
"directory": self.output.directory,

autonima/pipeline.py

Lines changed: 32 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -226,55 +226,55 @@ async def _execute_retrieval_phase(self):
226226
if not self._retriever:
227227
raise RuntimeError("Retriever not initialized")
228228

229-
# Check for existing full texts from user-provided source
229+
# Check for existing full texts from user-provided sources
230230
studies_from_user_source = []
231231
studies_for_pubget = included_studies
232232

233-
# If full_text_source is configured, try to map PMIDs to existing texts
234-
if (hasattr(self.config.retrieval, 'full_text_source') and
235-
self.config.retrieval.full_text_source):
233+
# If full_text_sources are configured, try to map PMIDs to existing texts
234+
if (hasattr(self.config.retrieval, 'full_text_sources') and
235+
self.config.retrieval.full_text_sources):
236236

237237
try:
238238
from .retrieval.utils import _map_pmids_to_text
239239

240-
# Get the configuration for the full text source
241-
full_text_config = self.config.retrieval.full_text_source
242-
243240
# Extract PMIDs from included studies
244241
pmids = [int(s.pmid) for s in included_studies if s.pmid.isdigit()]
242+
pmids_set = set(pmids)
245243

246-
# Map PMIDs to text files
247-
pmid_to_text_path = _map_pmids_to_text(
248-
root_path=full_text_config['root_path'],
249-
pmid_source=full_text_config['pmid_source'],
250-
text_path_templates=full_text_config.get('text_path_templates'),
251-
pmids_to_include=set(pmids),
252-
json_filename=full_text_config.get('json_filename', 'identifiers.json'),
253-
json_pmid_key=full_text_config.get('json_pmid_key', 'pmid'),
254-
allowed_extensions=full_text_config.get('allowed_extensions')
255-
)
256-
257-
# Update studies with their full text paths
258-
for study in included_studies:
259-
if int(study.pmid) in pmid_to_text_path:
260-
study.full_text_path = str(pmid_to_text_path[int(study.pmid)])
261-
study.status = StudyStatus.FULLTEXT_CACHED
262-
studies_from_user_source.append(study)
263-
264-
# Filter out studies that were found in the user source
265-
studies_for_pubget = [
266-
s for s in included_studies
267-
if s not in studies_from_user_source
268-
]
244+
# Process each full text source
245+
for i, full_text_config in enumerate(self.config.retrieval.full_text_sources):
246+
if not full_text_config:
247+
continue
248+
249+
logger.info(f"Processing full text source {i+1}/{len(self.config.retrieval.full_text_sources)}")
250+
251+
# Map PMIDs to text files
252+
pmid_to_text_path = _map_pmids_to_text(
253+
root_path=full_text_config['root_path'],
254+
pmid_source=full_text_config['pmid_source'],
255+
text_path_templates=full_text_config.get('text_path_templates'),
256+
pmids_to_include=pmids_set,
257+
json_filename=full_text_config.get('json_filename', 'identifiers.json'),
258+
json_pmid_key=full_text_config.get('json_pmid_key', 'pmid'),
259+
allowed_extensions=full_text_config.get('allowed_extensions')
260+
)
261+
262+
# Update studies with their full text paths
263+
for study in studies_for_pubget[:]: # Use a copy to safely modify during iteration
264+
if int(study.pmid) in pmid_to_text_path:
265+
study.full_text_path = str(pmid_to_text_path[int(study.pmid)])
266+
study.status = StudyStatus.FULLTEXT_CACHED
267+
studies_from_user_source.append(study)
268+
studies_for_pubget.remove(study) # Remove from studies_for_pubget
269269

270270
logger.info(
271271
f"Found {len(studies_from_user_source)} studies in user-provided "
272-
"full text source"
272+
"full text sources"
273273
)
274274

275275
except Exception as e:
276276
logger.warning(
277-
f"Failed to load from user-provided full text source: {e}"
277+
f"Failed to load from user-provided full text sources: {e}"
278278
)
279279

280280
# Fetch PMCIDs for studies that will use PubGet (those without full_text_path)

examples/sample_config.yml

Lines changed: 18 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -14,17 +14,24 @@ search:
1414
retrieval:
1515
sources:
1616
- pubget
17-
# full_text_source:
18-
# root_path: "/path/to/your/full/texts"
19-
# pmid_source: "folder_name" # or "json" or "file_name"
20-
# text_path_templates:
21-
# - "fulltext.txt"
22-
# - "text.txt"
23-
# # For pmid_source: "json", you can customize:
24-
# # json_filename: "identifiers.json"
25-
# # json_pmid_key: "pmid"
26-
# # For pmid_source: "file_name", you can customize:
27-
# # allowed_extensions: [".txt", ".xml"]
17+
# full_text_sources:
18+
# - root_path: "/path/to/your/first/full/texts"
19+
# pmid_source: "folder_name" # or "json" or "file_name"
20+
# text_path_templates:
21+
# - "fulltext.txt"
22+
# - "text.txt"
23+
# # For pmid_source: "json", you can customize:
24+
# # json_filename: "identifiers.json"
25+
# # json_pmid_key: "pmid"
26+
# # For pmid_source: "file_name", you can customize:
27+
# # allowed_extensions: [".txt", ".xml"]
28+
# - root_path: "/path/to/your/second/full/texts"
29+
# pmid_source: "json"
30+
# text_path_templates:
31+
# - "processed/pubget/text.txt"
32+
# - "text.txt"
33+
# json_filename: "identifiers.json"
34+
# json_pmid_key: "pmid"
2835

2936
screening:
3037
abstract:

0 commit comments

Comments
 (0)