Skip to content

Commit 31a2525

Browse files
committed
Merge branch 'master' of github.com:adelavega/autonima
2 parents b6437c7 + 7c11e1a commit 31a2525

12 files changed

Lines changed: 246 additions & 155 deletions

File tree

autonima.egg-info/PKG-INFO

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Requires-Dist: datetime
3333
Requires-Dist: biopython>=1.81
3434
Requires-Dist: pandas>=2.0
3535
Requires-Dist: matplotlib>=3.5
36+
Requires-Dist: pubget>=0.0.8
3637
Provides-Extra: dev
3738
Requires-Dist: pytest>=7.0; extra == "dev"
3839
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -143,17 +144,19 @@ Where `sample_config.yaml` specifies:
143144
1. Define your review specification (`YAML` or `JSON`):
144145

145146
```yaml
146-
objective: "Identify fMRI studies of working memory in schizophrenia"
147147
search:
148148
database: "pubmed"
149149
query: "schizophrenia AND working memory AND fMRI"
150-
inclusion_criteria:
151-
- Human participants
152-
- fMRI neuroimaging
153-
- Case-control design
154-
exclusion_criteria:
155-
- Animal studies
156-
- Review articles
150+
screening:
151+
abstract:
152+
objective: "Identify fMRI studies of working memory in schizophrenia"
153+
inclusion_criteria:
154+
- Human participants
155+
- fMRI neuroimaging
156+
- Case-control design
157+
exclusion_criteria:
158+
- Animal studies
159+
- Review articles
157160
```
158161

159162
2. Run Autonima pipeline:

autonima.egg-info/SOURCES.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ autonima/__main__.py
55
autonima/cli.py
66
autonima/config.py
77
autonima/pipeline.py
8+
autonima/utils.py
89
autonima.egg-info/PKG-INFO
910
autonima.egg-info/SOURCES.txt
1011
autonima.egg-info/dependency_links.txt
@@ -13,6 +14,11 @@ autonima.egg-info/requires.txt
1314
autonima.egg-info/top_level.txt
1415
autonima/models/__init__.py
1516
autonima/models/types.py
17+
autonima/retrieval/__init__.py
18+
autonima/retrieval/ace.py
19+
autonima/retrieval/base.py
20+
autonima/retrieval/pubget.py
21+
autonima/retrieval/utils.py
1622
autonima/screening/__init__.py
1723
autonima/screening/base.py
1824
autonima/screening/openai_client.py
@@ -23,6 +29,13 @@ autonima/search/__init__.py
2329
autonima/search/base.py
2430
autonima/search/pubmed.py
2531
tests/test_basic.py
32+
tests/test_confidence_reporting.py
33+
tests/test_fulltext_loading.py
34+
tests/test_fulltext_screening.py
35+
tests/test_objective_in_prompt.py
36+
tests/test_parallel_screening.py
37+
tests/test_pipeline_retrieval.py
2638
tests/test_pubmed.py
39+
tests/test_retrieval.py
2740
tests/test_screening.py
2841
tests/test_simplified_screening.py

autonima.egg-info/requires.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ datetime
99
biopython>=1.81
1010
pandas>=2.0
1111
matplotlib>=3.5
12+
pubget>=0.0.8
1213

1314
[dev]
1415
pytest>=7.0

autonima/cli.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -88,29 +88,11 @@ def run(
8888
# Set output directory to the specified output folder
8989
pipeline_config.output.directory = output_folder
9090

91-
# Log screening info
92-
abstract_config = pipeline_config.screening.abstract
93-
abstract_objective = abstract_config.get('objective')
94-
if abstract_objective:
95-
logger.info(f"Abstract screening objective: {abstract_objective}")
96-
97-
fulltext_config = pipeline_config.screening.fulltext
98-
fulltext_objective = fulltext_config.get('objective')
99-
if fulltext_objective:
100-
logger.info(f"Fulltext screening objective: {fulltext_objective}")
101-
102-
logger.info(f"Search database: {pipeline_config.search.database}")
103-
logger.info(f"Search query: {pipeline_config.search.query}")
104-
logger.info(f"Output directory: {pipeline_config.output.directory}")
105-
10691
if dry_run:
10792
logger.info("Dry run completed successfully")
10893
logger.info("Configuration is valid")
10994
return
11095

111-
# Run the pipeline
112-
logger.info("Starting pipeline execution...")
113-
11496
async def execute_pipeline():
11597
results = await run_pipeline_from_config(
11698
config=pipeline_config, num_workers=num_workers

autonima/models/types.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class Study:
3838
retrieved_at: Optional[datetime] = None
3939
screened_at: Optional[datetime] = None
4040
pmcid: Optional[str] = None
41+
full_text_path: Optional[str] = None
4142

4243
def to_dict(self) -> Dict[str, Any]:
4344
"""Convert study to dictionary representation."""
@@ -63,6 +64,7 @@ def to_dict(self) -> Dict[str, Any]:
6364
"screened_at": (
6465
self.screened_at.isoformat() if self.screened_at else None
6566
),
67+
"full_text_path": self.full_text_path,
6668
}
6769

6870
def load_full_text(self, output_dir: str) -> str:
@@ -127,6 +129,8 @@ class RetrievalConfig:
127129
max_retries: int = 3
128130
download_directory: str = "downloads"
129131
n_jobs: int = 1
132+
# Optional full text source configuration
133+
full_text_source: Optional[Dict[str, Any]] = None
130134

131135

132136
@dataclass
@@ -168,6 +172,7 @@ def to_dict(self) -> Dict[str, Any]:
168172
"max_retries": self.retrieval.max_retries,
169173
"download_directory": self.retrieval.download_directory,
170174
"n_jobs": self.retrieval.n_jobs,
175+
"full_text_source": self.retrieval.full_text_source,
171176
},
172177
"output": {
173178
"directory": self.output.directory,

autonima/pipeline.py

Lines changed: 78 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,6 @@ async def run(self) -> PipelineResult:
122122

123123
async def _execute_search_phase(self):
124124
"""Execute the literature search phase."""
125-
logger.info("Starting literature search phase")
126125

127126
if not self._search_engine:
128127
raise RuntimeError("Search engine not initialized")
@@ -153,12 +152,8 @@ async def _execute_search_phase(self):
153152
import json
154153
json.dump(search_data, f, indent=2)
155154

156-
logger.info(f"Search completed: found {len(studies)} studies")
157-
logger.info(f"Search results saved to {search_results_file}")
158-
159155
async def _execute_abstract_screening(self):
160156
"""Execute abstract screening phase."""
161-
logger.info("Starting abstract screening phase")
162157

163158
# Get studies that need screening
164159
pending_studies = [
@@ -212,13 +207,9 @@ async def _execute_abstract_screening(self):
212207
f"Abstract screening completed: {screened_count} studies "
213208
"screened"
214209
)
215-
logger.info(
216-
f"Abstract screening results saved to {screening_results_file}"
217-
)
218210

219211
async def _execute_retrieval_phase(self):
220212
"""Execute full-text retrieval phase."""
221-
logger.info("Starting full-text retrieval phase")
222213

223214
# Get included studies that need full-text retrieval
224215
included_studies = [
@@ -235,9 +226,60 @@ async def _execute_retrieval_phase(self):
235226
if not self._retriever:
236227
raise RuntimeError("Retriever not initialized")
237228

238-
# Fetch PMCIDs for included studies that don't have them
229+
# Check for existing full texts from user-provided source
230+
studies_from_user_source = []
231+
studies_for_pubget = included_studies
232+
233+
# If full_text_source is configured, try to map PMIDs to existing texts
234+
if (hasattr(self.config.retrieval, 'full_text_source') and
235+
self.config.retrieval.full_text_source):
236+
237+
try:
238+
from .retrieval.utils import _map_pmids_to_text
239+
240+
# Get the configuration for the full text source
241+
full_text_config = self.config.retrieval.full_text_source
242+
243+
# Extract PMIDs from included studies
244+
pmids = [int(s.pmid) for s in included_studies if s.pmid.isdigit()]
245+
246+
# Map PMIDs to text files
247+
pmid_to_text_path = _map_pmids_to_text(
248+
root_path=full_text_config['root_path'],
249+
pmid_source=full_text_config['pmid_source'],
250+
text_path_templates=full_text_config.get('text_path_templates'),
251+
pmids_to_include=set(pmids),
252+
json_filename=full_text_config.get('json_filename', 'identifiers.json'),
253+
json_pmid_key=full_text_config.get('json_pmid_key', 'pmid'),
254+
allowed_extensions=full_text_config.get('allowed_extensions')
255+
)
256+
257+
# Update studies with their full text paths
258+
for study in included_studies:
259+
if int(study.pmid) in pmid_to_text_path:
260+
study.full_text_path = str(pmid_to_text_path[int(study.pmid)])
261+
study.status = StudyStatus.FULLTEXT_CACHED
262+
studies_from_user_source.append(study)
263+
264+
# Filter out studies that were found in the user source
265+
studies_for_pubget = [
266+
s for s in included_studies
267+
if s not in studies_from_user_source
268+
]
269+
270+
logger.info(
271+
f"Found {len(studies_from_user_source)} studies in user-provided "
272+
"full text source"
273+
)
274+
275+
except Exception as e:
276+
logger.warning(
277+
f"Failed to load from user-provided full text source: {e}"
278+
)
279+
280+
# Fetch PMCIDs for studies that will use PubGet (those without full_text_path)
239281
studies_needing_pmcid = [
240-
s for s in included_studies if not s.pmcid
282+
s for s in studies_for_pubget if not s.pmcid
241283
]
242284

243285
if studies_needing_pmcid:
@@ -255,34 +297,31 @@ async def _execute_retrieval_phase(self):
255297
else:
256298
not_found.append(study.pmid)
257299

258-
if not_found:
259-
logger.warning(
260-
f"PMCIDs not found for {len(not_found)} studies."
300+
# Use PubGet for actual retrieval (only for studies not found in user source)
301+
if studies_for_pubget:
302+
output_dir = Path(self.config.output.directory)
303+
retrieval_dir = output_dir / "retrieval"
304+
305+
# Retrieve full-text articles
306+
try:
307+
api_key = getattr(self.config.retrieval, 'api_key', None)
308+
n_docs = getattr(self.config.retrieval, 'max_docs', None)
309+
310+
_ = self._retriever.retrieve(
311+
studies=studies_for_pubget,
312+
output_dir=retrieval_dir,
313+
api_key=api_key,
314+
n_docs=n_docs
261315
)
316+
317+
except Exception as e:
318+
log_error_with_debug(logger, f"Full-text retrieval failed: {e}")
262319

263-
# Use PubGet for actual retrieval
264-
output_dir = Path(self.config.output.directory)
265-
retrieval_dir = output_dir / "retrieval"
266-
267-
# Retrieve full-text articles
268-
try:
269-
api_key = getattr(self.config.retrieval, 'api_key', None)
270-
n_docs = getattr(self.config.retrieval, 'max_docs', None)
271-
272-
_ = self._retriever.retrieve(
273-
studies=included_studies,
274-
output_dir=retrieval_dir,
275-
api_key=api_key,
276-
n_docs=n_docs
277-
)
278-
279-
except Exception as e:
280-
log_error_with_debug(logger, f"Full-text retrieval failed: {e}")
281-
282-
# Validate retrieval
283-
self._retriever.validate_retrieval(included_studies, retrieval_dir)
320+
# Validate retrieval
321+
self._retriever.validate_retrieval(studies_for_pubget, retrieval_dir)
284322

285323
# Save intermediary results
324+
output_dir = Path(self.config.output.directory)
286325
retrieval_results_file = output_dir / "outputs" / "fulltext_retrieval_results.json"
287326
retrieval_data = {
288327
"studies_with_fulltext": [
@@ -294,7 +333,8 @@ async def _execute_retrieval_phase(self):
294333
study.retrieved_at.isoformat()
295334
if study.retrieved_at else None
296335
),
297-
"status": study.status.value
336+
"status": study.status.value,
337+
"full_text_path": study.full_text_path
298338
}
299339
for study in self.results.studies
300340
if (study.status in [
@@ -328,15 +368,11 @@ async def _execute_retrieval_phase(self):
328368

329369
logger.info(
330370
f"Full-text retrieval completed: {retrieved_count} texts "
331-
f"retrieved, {unavailable_count} unavailable"
332-
)
333-
logger.info(
334-
f"Full-text retrieval results saved to {retrieval_results_file}"
371+
f"retrieved/cached, {unavailable_count} unavailable"
335372
)
336373

337374
async def _execute_fulltext_screening(self):
338375
"""Execute full-text screening phase."""
339-
logger.info("Starting full-text screening phase")
340376

341377
# Get studies with full text that need screening
342378
screenable_studies = [
@@ -394,20 +430,15 @@ async def _execute_fulltext_screening(self):
394430
logger.info(
395431
f"Full-text screening completed: {final_count} studies included"
396432
)
397-
logger.info(
398-
f"Full-text screening results saved to "
399-
f"{fulltext_screening_results_file}"
400-
)
401433

402434
async def _execute_output_phase(self):
403435
"""Execute output generation phase."""
404-
logger.info("Starting output generation phase")
405436

406437
# TODO: Implement comprehensive output generation
407438
# For now, generate basic statistics
408439
await self._generate_basic_outputs()
409440

410-
logger.info("Output generation completed")
441+
logger.info("Saved final results and statistics")
411442

412443
async def _generate_basic_outputs(self):
413444
"""Generate basic outputs and statistics."""
@@ -447,8 +478,6 @@ async def _generate_basic_outputs(self):
447478
import json
448479
json.dump(self.results.to_dict(final_studies_only=True), f, indent=2)
449480

450-
logger.info(f"Final results saved to {final_results_file}")
451-
452481
def get_statistics(self) -> Dict[str, Any]:
453482
"""Get pipeline execution statistics."""
454483
# Get objective from screening configuration

autonima/retrieval/pubget.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,6 @@ def retrieve(
8585
existing_pmcids = set(
8686
existing_df['pmcid'].dropna().astype(int).tolist()
8787
)
88-
logger.info(
89-
f"Found {len(existing_pmcids)} already downloaded "
90-
"PMCIDs"
91-
)
9288
except Exception as e:
9389
logger.warning(f"Could not read existing metadata: {e}")
9490

@@ -136,11 +132,9 @@ def retrieve(
136132
# Check to see how many articles were downloaded
137133
# If no articles were downloaded, skip the rest of the steps
138134
if not download_dir or not Path(download_dir).exists():
139-
logger.warning("No articles were downloaded.")
140135
return studies
141136
downloaded_files = list(Path(download_dir).rglob('*'))
142137
if len(downloaded_files) <= 1:
143-
logger.warning("No articles were downloaded.")
144138
return studies
145139

146140
# Extract articles from bulk download

0 commit comments

Comments
 (0)