Commit c57ac1f

add a cli

1 parent 18a94d9 commit c57ac1f

9 files changed (+704, -0 lines changed)

README.md

Lines changed: 22 additions & 0 deletions
@@ -30,3 +30,25 @@ downloaded = download_articles(records)
coordinates = extract_coordinates(downloaded)
print(coordinates)
```

## Command-Line Interface

Installing the package with `pip install .` (or from PyPI) makes the `elsevier-extract` script available. It accepts three mutually exclusive identifier inputs:

- `--pmids` for comma-separated PMIDs or a text file containing one PMID per line
- `--dois` for comma-separated DOIs or a text file containing one DOI per line
- `--jsonl` for a JSON Lines file where each line is `{"doi": "...", "pmid": "..."}` (see the example below)
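
For example, a minimal JSONL input could look like this (the identifiers below are placeholders):

```jsonl
{"doi": "10.1016/j.example.2024.01.001", "pmid": "12345678"}
{"pmid": "23456789"}
{"doi": "10.1016/j.example.2024.01.002"}
```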
Additional flags allow users to skip writing specific outputs (`--skip-xml`, `--skip-text`, `--skip-tables`, `--skip-coordinates`), continue past failures (`--continue-on-error`), disable caching (`--no-cache`), or adjust verbosity (`-v/--verbose`, `-q/--quiet`). `--output-dir` controls the base directory for results, and the CLI honors `ELSEVIER_EXTRACTION_WORKERS` when no `--max-workers` override is provided.
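
For instance, a run that takes its worker count from the environment might look like this (paths and values are illustrative):

```bash
# Used when --max-workers is not passed on the command line
export ELSEVIER_EXTRACTION_WORKERS=8

elsevier-extract --dois dois.txt --output-dir ./results \
    --skip-xml --continue-on-error --verbose
```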
### Output layout

Each article is saved under `output-dir/{identifier}`, where `{identifier}` is the filesystem-friendly DOI (slashes replaced with `_`) or the PMID. Inside that directory you will find the files listed below (a sample layout follows the list):

- `article.xml` – the raw XML payload
- `metadata.json` – download metadata, rate-limit snapshot, and supplementary attachments
- `text.txt` – formatted article text (title/abstract/body)
- `coordinates.json` – NIMADS-style evaluation of extracted coordinates
- `tables/*.csv` – extracted tables named after their labels/captions
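
A run against a single DOI might therefore produce a layout like this (the identifier and table file names are illustrative; actual table names come from their labels/captions):

```
results/
└── 10.1016_j.example.2024.01.001/
    ├── article.xml
    ├── metadata.json
    ├── text.txt
    ├── coordinates.json
    └── tables/
        ├── Table_1.csv
        └── Table_2.csv
```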
The CLI also appends a record for every run to `manifest.jsonl` (with status, timing, and file list) and logs failures to `errors.jsonl`, enabling auditing and resumable processing.
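
Each manifest line is a JSON object; a hypothetical entry (the field names here are illustrative, not the exact schema) could look like:

```json
{"identifier": "10.1016_j.example.2024.01.001", "status": "success", "duration_seconds": 4.2, "files": ["article.xml", "metadata.json", "text.txt", "coordinates.json"]}
```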

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
"""Command-line interface helpers for Elsevier coordinate extraction."""

__all__ = ["main", "inputs", "outputs", "orchestrator"]

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, List

_RECORD_KEYS = {"doi", "pmid"}


def _normalize_record(payload: Dict[str, str]) -> Dict[str, str]:
    # Keep only non-empty doi/pmid values, stripped of surrounding whitespace.
    normalized = {
        key: payload[key].strip()
        for key in _RECORD_KEYS
        if payload.get(key)
    }
    return normalized


def parse_text_file(path: Path, key: str) -> List[Dict[str, str]]:
    if key not in _RECORD_KEYS:
        raise ValueError(f"Unsupported identifier key: {key}")
    records: List[Dict[str, str]] = []
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            # Skip blank lines and comment lines.
            if not line or line.startswith("#"):
                continue
            records.append({key: line})
    return records


def parse_pmids(value: str) -> List[Dict[str, str]]:
    # Treat the value as a file path if it exists, otherwise as a
    # comma-separated list of identifiers.
    path = Path(value)
    if path.exists():
        return parse_text_file(path, "pmid")
    pmids = [item.strip() for item in value.split(",") if item.strip()]
    return [{"pmid": pmid} for pmid in pmids]


def parse_dois(value: str) -> List[Dict[str, str]]:
    path = Path(value)
    if path.exists():
        return parse_text_file(path, "doi")
    dois = [item.strip() for item in value.split(",") if item.strip()]
    return [{"doi": doi} for doi in dois]


def parse_jsonl(path: Path) -> List[Dict[str, str]]:
    records: List[Dict[str, str]] = []
    with path.open("r", encoding="utf-8") as fh:
        for line_num, raw in enumerate(fh, 1):
            raw = raw.strip()
            if not raw:
                continue
            try:
                payload = json.loads(raw)
            except json.JSONDecodeError as exc:  # pragma: no cover
                raise ValueError(
                    f"Invalid JSON on line {line_num}: {exc}"
                ) from exc
            record = _normalize_record(payload)
            if not record:
                raise ValueError(f"Record on line {line_num} lacks doi/pmid")
            records.append(record)
    return records


def validate_records(records: List[Dict[str, str]]) -> List[Dict[str, str]]:
    validated = []
    for record in records:
        normalized = _normalize_record(record)
        if not normalized:
            raise ValueError(
                "Each record must contain at least a doi or pmid."
            )
        validated.append(normalized)
    return validated
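
A quick sketch of how these parsers behave (`elsevier_coordinate_extraction.cli.inputs` is an assumed module path, inferred from the package imports below):

from elsevier_coordinate_extraction.cli.inputs import parse_pmids, validate_records

# Comma-separated values are split and stripped...
records = parse_pmids("12345678, 23456789")
# -> [{"pmid": "12345678"}, {"pmid": "23456789"}]

# ...while a value naming an existing file is read line by line instead.
records = parse_pmids("pmids.txt")

# validate_records re-normalizes and rejects records lacking both keys.
validate_records([{"doi": " 10.1016/j.example.2024.01.001 "}])
# -> [{"doi": "10.1016/j.example.2024.01.001"}]
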
Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
from __future__ import annotations

import argparse
import asyncio
import sys
from pathlib import Path

from elsevier_coordinate_extraction.settings import get_settings

from .inputs import (
    parse_dois,
    parse_jsonl,
    parse_pmids,
    validate_records,
)
from .orchestrator import process_articles


DESCRIPTION = """
Download Elsevier articles, extract text/tables/coordinates,
and write structured outputs.
"""

EXAMPLES = """Examples:
  # Download by PubMed IDs inline
  elsevier-extract --pmids 12345678,23456789 --output-dir ./results

  # Download by DOIs from file
  elsevier-extract --dois dois.txt --output-dir ./results

  # Batch from JSONL
  elsevier-extract --jsonl identifiers.jsonl --continue-on-error
"""


def create_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        prog="elsevier-extract",
        description=DESCRIPTION,
        epilog=EXAMPLES,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Exactly one identifier source must be supplied.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--pmids", type=str, help="Comma-separated PMIDs or file path"
    )
    input_group.add_argument(
        "--dois", type=str, help="Comma-separated DOIs or file path"
    )
    input_group.add_argument(
        "--jsonl", type=Path, help="JSONL file with doi/pmid records"
    )

    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("./elsevier_output"),
        help="Base directory for article outputs",
    )
    parser.add_argument(
        "--skip-xml",
        action="store_true",
        help="Skip writing raw XML",
    )
    parser.add_argument(
        "--skip-text",
        action="store_true",
        help="Skip writing extracted text",
    )
    parser.add_argument(
        "--skip-tables",
        action="store_true",
        help="Skip writing extracted tables",
    )
    parser.add_argument(
        "--skip-coordinates",
        action="store_true",
        help="Skip writing coordinates JSON",
    )
    parser.add_argument(
        "--continue-on-error",
        action="store_true",
        help="Keep going after failures",
    )
    parser.add_argument(
        "--max-workers",
        type=int,
        help="Override extraction worker count",
    )
    parser.add_argument(
        "--no-cache",
        action="store_true",
        help="Disable response caching",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print progress messages",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Minimal console output",
    )
    return parser


def gather_records(args: argparse.Namespace) -> list[dict[str, str]]:
    if args.pmids:
        return parse_pmids(args.pmids)
    if args.dois:
        return parse_dois(args.dois)
    if args.jsonl:
        return parse_jsonl(args.jsonl)
    return []


async def async_main(args: argparse.Namespace) -> int:
    try:
        records = validate_records(gather_records(args))
    except Exception as exc:
        print(f"Input error: {exc}", file=sys.stderr)
        return 1

    if not records:
        print("No identifiers were provided.", file=sys.stderr)
        return 1

    settings = get_settings()
    if args.max_workers:
        # Rebuild the settings object with the worker-count override applied.
        settings = settings.__class__(
            **{
                **settings.__dict__,
                "extraction_workers": args.max_workers,
            }
        )

    if not args.quiet:
        print(f"Processing {len(records)} article(s)...")

    try:
        stats = await process_articles(
            records,
            args.output_dir,
            settings=settings,
            skip_xml=args.skip_xml,
            skip_text=args.skip_text,
            skip_tables=args.skip_tables,
            skip_coordinates=args.skip_coordinates,
            continue_on_error=args.continue_on_error,
            use_cache=not args.no_cache,
            verbose=args.verbose,
        )
    except Exception as exc:
        print(f"Processing failed: {exc}", file=sys.stderr)
        return 1

    if not args.quiet:
        summary = (
            "\nSummary: success="
            f"{stats['success']} failed={stats['failed']} "
            f"skipped={stats['skipped']}"
        )
        print(summary)

    # Exit non-zero if any article failed, so scripts can detect problems.
    return 0 if stats["failed"] == 0 else 1


def main() -> int:
    parser = create_parser()
    args = parser.parse_args()
    try:
        return asyncio.run(async_main(args))
    except KeyboardInterrupt:
        print("\nInterrupted", file=sys.stderr)
        return 130
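
# A minimal sketch for running this module directly; an assumption, since the
# installed console script is expected to invoke main() itself:
if __name__ == "__main__":
    sys.exit(main())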
