Commit c57ac1f

add a cli

1 parent 18a94d9 commit c57ac1f

9 files changed (+704, -0 lines changed)

README.md

Lines changed: 22 additions & 0 deletions
@@ -30,3 +30,25 @@ downloaded = download_articles(records)
coordinates = extract_coordinates(downloaded)
print(coordinates)
```

## Command-Line Interface

Installing the package with `pip install .` (or from PyPI) makes the `elsevier-extract` script available. It accepts three mutually exclusive identifier inputs:

- `--pmids` for comma-separated PMIDs or a text file containing one PMID per line
- `--dois` for comma-separated DOIs or a text file containing one DOI per line
- `--jsonl` for a JSON Lines file where each line is `{"doi": "...", "pmid": "..."}` (see the example below)
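
For example, a minimal JSONL input could look like this (the identifiers below are placeholders):

```jsonl
{"doi": "10.1016/j.example.2024.01.001", "pmid": "12345678"}
{"pmid": "23456789"}
{"doi": "10.1016/j.example.2024.01.002"}
```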
Additional flags allow users to skip writing specific outputs (`--skip-xml`, `--skip-text`, `--skip-tables`, `--skip-coordinates`), continue past failures (`--continue-on-error`), disable caching (`--no-cache`), or adjust verbosity (`-v/--verbose`, `-q/--quiet`). `--output-dir` controls the base directory for results, and the CLI honors `ELSEVIER_EXTRACTION_WORKERS` when no `--max-workers` override is provided.
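
For instance, a run that takes its worker count from the environment might look like this (paths and values are illustrative):

```bash
# Used when --max-workers is not passed on the command line
export ELSEVIER_EXTRACTION_WORKERS=8

elsevier-extract --dois dois.txt --output-dir ./results \
    --skip-xml --continue-on-error --verbose
```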
### Output layout

Each article is saved under `output-dir/{identifier}`, where `{identifier}` is the filesystem-friendly DOI (slashes replaced with `_`) or the PMID. Inside that directory you will find the files listed below (a sample layout follows the list):

- `article.xml` – the raw XML payload
- `metadata.json` – download metadata, rate-limit snapshot, and supplementary attachments
- `text.txt` – formatted article text (title/abstract/body)
- `coordinates.json` – NIMADS-style evaluation of extracted coordinates
- `tables/*.csv` – extracted tables named after their labels/captions
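
A run against a single DOI might therefore produce a layout like this (the identifier and table file names are illustrative; actual table names come from their labels/captions):

```
results/
└── 10.1016_j.example.2024.01.001/
    ├── article.xml
    ├── metadata.json
    ├── text.txt
    ├── coordinates.json
    └── tables/
        ├── Table_1.csv
        └── Table_2.csv
```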
The CLI also appends a record for every run to `manifest.jsonl` (with status, timing, and file list) and logs failures to `errors.jsonl`, enabling auditing and resumable processing.
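
Each manifest line is a JSON object; a hypothetical entry (the field names here are illustrative, not the exact schema) could look like:

```json
{"identifier": "10.1016_j.example.2024.01.001", "status": "success", "duration_seconds": 4.2, "files": ["article.xml", "metadata.json", "text.txt", "coordinates.json"]}
```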

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
"""Command-line interface helpers for Elsevier coordinate extraction."""

__all__ = ["main", "inputs", "outputs", "orchestrator"]

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, List

_RECORD_KEYS = {"doi", "pmid"}


def _normalize_record(payload: Dict[str, str]) -> Dict[str, str]:
    # Keep only non-empty doi/pmid values, stripped of surrounding whitespace.
    normalized = {
        key: payload[key].strip()
        for key in _RECORD_KEYS
        if payload.get(key)
    }
    return normalized


def parse_text_file(path: Path, key: str) -> List[Dict[str, str]]:
    if key not in _RECORD_KEYS:
        raise ValueError(f"Unsupported identifier key: {key}")
    records: List[Dict[str, str]] = []
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            # Skip blank lines and comment lines.
            if not line or line.startswith("#"):
                continue
            records.append({key: line})
    return records


def parse_pmids(value: str) -> List[Dict[str, str]]:
    # Treat the value as a file path if it exists, otherwise as a
    # comma-separated list of identifiers.
    path = Path(value)
    if path.exists():
        return parse_text_file(path, "pmid")
    pmids = [item.strip() for item in value.split(",") if item.strip()]
    return [{"pmid": pmid} for pmid in pmids]


def parse_dois(value: str) -> List[Dict[str, str]]:
    path = Path(value)
    if path.exists():
        return parse_text_file(path, "doi")
    dois = [item.strip() for item in value.split(",") if item.strip()]
    return [{"doi": doi} for doi in dois]


def parse_jsonl(path: Path) -> List[Dict[str, str]]:
    records: List[Dict[str, str]] = []
    with path.open("r", encoding="utf-8") as fh:
        for line_num, raw in enumerate(fh, 1):
            raw = raw.strip()
            if not raw:
                continue
            try:
                payload = json.loads(raw)
            except json.JSONDecodeError as exc:  # pragma: no cover
                raise ValueError(
                    f"Invalid JSON on line {line_num}: {exc}"
                ) from exc
            record = _normalize_record(payload)
            if not record:
                raise ValueError(f"Record on line {line_num} lacks doi/pmid")
            records.append(record)
    return records


def validate_records(records: List[Dict[str, str]]) -> List[Dict[str, str]]:
    validated = []
    for record in records:
        normalized = _normalize_record(record)
        if not normalized:
            raise ValueError(
                "Each record must contain at least a doi or pmid."
            )
        validated.append(normalized)
    return validated
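
A quick sketch of how these parsers behave (`elsevier_coordinate_extraction.cli.inputs` is an assumed module path, inferred from the package imports below):

from elsevier_coordinate_extraction.cli.inputs import parse_pmids, validate_records

# Comma-separated values are split and stripped...
records = parse_pmids("12345678, 23456789")
# -> [{"pmid": "12345678"}, {"pmid": "23456789"}]

# ...while a value naming an existing file is read line by line instead.
records = parse_pmids("pmids.txt")

# validate_records re-normalizes and rejects records lacking both keys.
validate_records([{"doi": " 10.1016/j.example.2024.01.001 "}])
# -> [{"doi": "10.1016/j.example.2024.01.001"}]
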
Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path

from elsevier_coordinate_extraction.settings import get_settings

from .inputs import (
    parse_dois,
    parse_jsonl,
    parse_pmids,
    validate_records,
)
from .orchestrator import process_articles


DESCRIPTION = """
Download Elsevier articles, extract text/tables/coordinates,
and write structured outputs.
"""

EXAMPLES = """Examples:
  # Download by PubMed IDs inline
  elsevier-extract --pmids 12345678,23456789 --output-dir ./results

  # Download by DOIs from file
  elsevier-extract --dois dois.txt --output-dir ./results

  # Batch from JSONL
  elsevier-extract --jsonl identifiers.jsonl --continue-on-error
"""


def create_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="elsevier-extract",
        description=DESCRIPTION,
        epilog=EXAMPLES,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Exactly one identifier source must be supplied.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--pmids", type=str, help="Comma-separated PMIDs or file path"
    )
    input_group.add_argument(
        "--dois", type=str, help="Comma-separated DOIs or file path"
    )
    input_group.add_argument(
        "--jsonl", type=Path, help="JSONL file with doi/pmid records"
    )

    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("./elsevier_output"),
        help="Base directory for article outputs",
    )
    parser.add_argument(
        "--skip-xml",
        action="store_true",
        help="Skip writing raw XML",
    )
    parser.add_argument(
        "--skip-text",
        action="store_true",
        help="Skip writing extracted text",
    )
    parser.add_argument(
        "--skip-tables",
        action="store_true",
        help="Skip writing extracted tables",
    )
    parser.add_argument(
        "--skip-coordinates",
        action="store_true",
        help="Skip writing coordinates JSON",
    )
    parser.add_argument(
        "--continue-on-error",
        action="store_true",
        help="Keep going after failures",
    )
    parser.add_argument(
        "--max-workers",
        type=int,
        help="Override extraction worker count",
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Disable response caching",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print progress messages",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Minimal console output",
    )
    return parser


def gather_records(args: argparse.Namespace) -> list[dict[str, str]]:
    if args.pmids:
        return parse_pmids(args.pmids)
    if args.dois:
        return parse_dois(args.dois)
    if args.jsonl:
        return parse_jsonl(args.jsonl)
    return []


async def async_main(args: argparse.Namespace) -> int:
    try:
        records = validate_records(gather_records(args))
    except Exception as exc:
        print(f"Input error: {exc}", file=sys.stderr)
        return 1

    if not records:
        print("No identifiers were provided.", file=sys.stderr)
        return 1

    settings = get_settings()
    if args.max_workers:
        # Rebuild the settings object with the worker-count override applied.
        settings = settings.__class__(
            **{
                **settings.__dict__,
                "extraction_workers": args.max_workers,
            }
        )

    if not args.quiet:
        print(f"Processing {len(records)} article(s)...")

    try:
        stats = await process_articles(
            records,
            args.output_dir,
            settings=settings,
            skip_xml=args.skip_xml,
            skip_text=args.skip_text,
            skip_tables=args.skip_tables,
            skip_coordinates=args.skip_coordinates,
            continue_on_error=args.continue_on_error,
            use_cache=not args.no_cache,
            verbose=args.verbose,
        )
    except Exception as exc:
        print(f"Processing failed: {exc}", file=sys.stderr)
        return 1

    if not args.quiet:
        summary = (
            "\nSummary: success="
            f"{stats['success']} failed={stats['failed']} "
            f"skipped={stats['skipped']}"
        )
        print(summary)

    # Exit non-zero if any article failed, so scripts can detect problems.
    return 0 if stats["failed"] == 0 else 1


def main() -> int:
    parser = create_parser()
    args = parser.parse_args()
    try:
        return asyncio.run(async_main(args))
    except KeyboardInterrupt:
        print("\nInterrupted", file=sys.stderr)
        return 130
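
# A minimal sketch for running this module directly; an assumption, since the
# installed console script is expected to invoke main() itself:
if __name__ == "__main__":
    sys.exit(main())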
