Skip to content

Commit cd62473

Browse files
committed
create new package
0 parents  commit cd62473

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+35601
-0
lines changed

.env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
ELSEVIER_API_KEY=YOUR_API_KEY
2+
ELSEVIER_HTTP_PROXY=socks5://127.0.0.1:1080
3+
ELSEVIER_HTTPS_PROXY=socks5://127.0.0.1:1080

.gitignore

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
__pycache__/
2+
*.py[cod]
3+
*$py.class
4+
5+
# Virtual environments
6+
.venv/
7+
venv/
8+
env/
9+
.env.local
10+
11+
# Distribution / packaging
12+
.Python
13+
build/
14+
develop-eggs/
15+
dist/
16+
downloads/
17+
eggs/
18+
.eggs/
19+
lib/
20+
lib64/
21+
parts/
22+
sdist/
23+
var/
24+
wheels/
25+
share/python-wheels/
26+
*.egg-info/
27+
.installed.cfg
28+
*.egg
29+
30+
# Installer logs
31+
pip-log.txt
32+
pip-delete-this-directory.txt
33+
34+
# Unit test / coverage reports
35+
.tox/
36+
.pytest_cache/
37+
.nox/
38+
coverage.xml
39+
.coverage
40+
.coverage.*
41+
htmlcov/
42+
cache/
43+
44+
# Type checker caches
45+
.mypy_cache/
46+
.dmypy.json
47+
dmypy.json
48+
49+
# Ruff / lint caches
50+
.ruff_cache/
51+
52+
# IDE / editor
53+
.idea/
54+
.vscode/
55+
*.code-workspace
56+
57+
# OS files
58+
.DS_Store
59+
Thumbs.db
60+
61+
# Logs
62+
*.log
63+
64+
# Local notebooks
65+
.ipynb_checkpoints/
66+
67+
# pyenv
68+
.python-version
69+
70+
.elsevier_cache

README.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Elsevier Coordinate Extraction
2+
3+
This package provides tools to search, download, and
4+
extract coordinates from Elsevier articles.
5+
6+
## Installation
7+
8+
pip install elsevier-coordinate-extraction
9+
10+
or a local install:
11+
```bash
12+
git clone https://github.com/yourusername/elsevier-coordinate-extraction.git
13+
cd elsevier-coordinate-extraction
14+
pip install -e .
15+
```
16+
17+
## Usage
18+
19+
```python
20+
from elsevier_coordinate_extraction import search_articles, download_articles, extract_coordinates
21+
22+
# Search for articles
23+
articles = search_articles(query="fmri", max_results=5)
24+
25+
# Download full-text XML for the first article using its DOI/PMID
26+
records = [{"doi": articles[0]["doi"], "pmid": articles[0].get("pmid")}] # type: ignore[index]
27+
downloaded = download_articles(records)
28+
29+
# Extract coordinates
30+
coordinates = extract_coordinates(downloaded)
31+
print(coordinates)
32+
```
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
"""
2+
Elsevier Coordinate Extraction package.
3+
4+
Module initialization sets up the exported API surface incrementally as
5+
components are implemented.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
# Public re-exports will be populated during implementation.
11+
12+
__all__: tuple[str, ...] = ()
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Disk-backed cache helpers."""
2+
3+
from __future__ import annotations
4+
5+
import asyncio
6+
import hashlib
7+
from pathlib import Path
8+
9+
10+
class FileCache:
    """Simple asynchronous cache that stores payloads on disk.

    Entries live at ``<root>/<namespace>/<sha256(key)>.bin``. File reads and
    writes are pushed to a worker thread via ``asyncio.to_thread`` so they do
    not block the event loop.
    """

    def __init__(self, root: Path | str) -> None:
        """Remember the cache root and create it if it does not exist."""
        self._root = Path(root)
        self._root.mkdir(parents=True, exist_ok=True)

    async def get(self, namespace: str, key: str) -> bytes | None:
        """Return cached bytes if present, otherwise ``None``."""
        path = self._path(namespace, key)
        # Read first and handle the miss, instead of exists()-then-read: this
        # avoids a race with concurrent removal and keeps all filesystem
        # access off the event loop.
        try:
            return await asyncio.to_thread(path.read_bytes)
        except FileNotFoundError:
            return None

    async def set(self, namespace: str, key: str, data: bytes) -> None:
        """Persist payload bytes for future reuse."""
        path = self._path(namespace, key)
        await asyncio.to_thread(self._write_atomic, path, data)

    def _path(self, namespace: str, key: str) -> Path:
        """Map ``(namespace, key)`` to its on-disk location.

        This is a pure computation: directories are created only when an
        entry is actually written, so lookups never touch the filesystem.
        """
        digest = hashlib.sha256(key.encode("utf-8")).hexdigest()
        return self._root / namespace / f"{digest}.bin"

    @staticmethod
    def _write_atomic(path: Path, data: bytes) -> None:
        """Write ``data`` to ``path`` via a temporary sibling file and rename."""
        import uuid

        path.parent.mkdir(parents=True, exist_ok=True)
        # A unique temp name per write keeps concurrent writers of the same
        # key from clobbering each other's partially written file.
        tmp_path = path.with_name(f"{path.stem}.{uuid.uuid4().hex}.tmp")
        try:
            tmp_path.write_bytes(data)
            tmp_path.replace(path)
        finally:
            # After a successful replace the temp name no longer exists and
            # this is a no-op; after a failure it removes the leftover file.
            tmp_path.unlink(missing_ok=True)
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""Async ScienceDirect client built on httpx."""
2+
3+
from __future__ import annotations
4+
5+
import asyncio
6+
from collections.abc import Mapping
7+
from typing import Any
8+
9+
import httpx
10+
11+
from . import rate_limits
12+
from .settings import Settings
13+
14+
__all__ = ["ScienceDirectClient"]
15+
16+
17+
class ScienceDirectClient:
    """Thin wrapper around httpx.AsyncClient with Elsevier defaults.

    The underlying ``httpx.AsyncClient`` is created lazily, either on
    ``async with`` entry or on the first request, and is closed on context
    exit. Concurrent requests are bounded by a semaphore sized from
    ``settings.concurrency`` (falling back to 1 when that value is falsy).
    """

    # Status codes for which a request may be retried.
    _RETRYABLE_STATUS = frozenset({429, 500, 503})

    def __init__(
        self,
        settings: Settings,
        *,
        transport: httpx.AsyncBaseTransport | None = None,
        max_retries: int = 3,
    ) -> None:
        """Store configuration; no network resources are created here.

        Args:
            settings: Source of API key, base URL, timeout, proxy, user agent
                and concurrency values.
            transport: Optional transport handed to ``httpx.AsyncClient``.
            max_retries: Maximum number of retries per request; negative
                values are clamped to 0.
        """
        self._settings = settings
        self._transport = transport
        self._max_retries = max(0, max_retries)
        self._client: httpx.AsyncClient | None = None
        concurrency = settings.concurrency or 1
        self._semaphore = asyncio.Semaphore(concurrency)

    async def __aenter__(self) -> ScienceDirectClient:
        """Create the underlying client (if needed) and return ``self``."""
        await self._ensure_client()
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:  # type: ignore[override]
        """Close the underlying client, if one was created."""
        if self._client is not None:
            await self._client.aclose()
            self._client = None

    async def request(
        self,
        method: str,
        path: str,
        *,
        params: Mapping[str, Any] | None = None,
        accept: str | None = None,
    ) -> httpx.Response:
        """Perform an HTTP request and return the response.

        Raises:
            httpx.HTTPStatusError: If the final response has an error status.
        """
        return await self._request(method, path, params=params, accept=accept)

    async def get_json(
        self,
        path: str,
        *,
        params: Mapping[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Perform a GET request expecting JSON and return the decoded body."""
        response = await self.request(
            "GET",
            path,
            params=params,
            accept="application/json",
        )
        return response.json()

    async def get_xml(
        self,
        path: str,
        *,
        params: Mapping[str, Any] | None = None,
    ) -> str:
        """Perform a GET request expecting XML and return the body text."""
        response = await self.request(
            "GET",
            path,
            params=params,
            accept="application/xml",
        )
        return response.text

    async def _ensure_client(self) -> httpx.AsyncClient:
        """Return the shared ``httpx.AsyncClient``, creating it on first use."""
        if self._client is None:
            headers: dict[str, str] = {
                "X-ELS-APIKey": self._settings.api_key,
                "User-Agent": self._settings.user_agent,
            }
            if self._settings.insttoken:
                headers["X-ELS-Insttoken"] = self._settings.insttoken
            timeout = httpx.Timeout(self._settings.timeout)
            # A single proxy is configured; the HTTPS setting takes precedence.
            proxy_value = self._settings.https_proxy or self._settings.http_proxy
            self._client = httpx.AsyncClient(
                base_url=self._settings.base_url,
                timeout=timeout,
                headers=headers,
                transport=self._transport,
                http2=True,
                proxy=proxy_value,
            )
        return self._client

    async def _request(
        self,
        method: str,
        path: str,
        *,
        params: Mapping[str, Any] | None,
        accept: str | None,
    ) -> httpx.Response:
        """Send one request, retrying retryable statuses when a delay is given.

        A response is retried only when all of the following hold:
        ``rate_limits.get_retry_delay`` returns a delay, the status code is in
        ``_RETRYABLE_STATUS``, and fewer than ``max_retries`` retries have
        been made. Otherwise the response is returned, or
        ``httpx.HTTPStatusError`` propagates from ``raise_for_status()``.
        """
        client = await self._ensure_client()
        request_headers = {"Accept": accept} if accept else {}
        attempt = 0
        while True:
            # Only the network call holds a concurrency slot; the back-off
            # sleep below happens after the slot is released.
            async with self._semaphore:
                response = await client.request(
                    method,
                    path,
                    params=params,
                    headers=request_headers,
                )
            delay = rate_limits.get_retry_delay(response)
            if (
                delay is not None
                and response.status_code in self._RETRYABLE_STATUS
                and attempt < self._max_retries
            ):
                await asyncio.sleep(delay)
                attempt += 1
                continue

            response.raise_for_status()
            return response
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""Download module public API."""
2+
3+
from __future__ import annotations
4+
5+
# Implementation forthcoming.

0 commit comments

Comments
 (0)