Skip to content

Commit 18c0da2

Browse files
committed
do not use standard view fallback
1 parent 584deb9 commit 18c0da2

File tree

8 files changed

+324
-193
lines changed

8 files changed

+324
-193
lines changed

elsevier_coordinate_extraction/client.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,26 @@ async def _request(
127127
headers=request_headers,
128128
)
129129
delay = rate_limits.get_retry_delay(response)
130+
max_wait = self._settings.max_rate_limit_wait
131+
if (
132+
delay is not None
133+
and response.status_code == 429
134+
and max_wait is not None
135+
and delay > max_wait
136+
):
137+
snapshot = rate_limits.get_rate_limit_snapshot(response)
138+
wait_seconds = snapshot.seconds_until_reset() or delay
139+
message = (
140+
"Rate limit reset wait "
141+
f"({wait_seconds:g}s) exceeds configured maximum "
142+
f"({max_wait:g}s)."
143+
)
144+
raise httpx.HTTPStatusError(
145+
message
146+
+ " Increase ELSEVIER_MAX_RATE_LIMIT_WAIT_SECONDS to allow longer waits.",
147+
request=response.request,
148+
response=response,
149+
)
130150
if (
131151
delay is not None
132152
and response.status_code in {429, 500, 503}

elsevier_coordinate_extraction/download/api.py

Lines changed: 60 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import httpx
1010
from lxml import etree
1111

12+
from elsevier_coordinate_extraction import rate_limits
1213
from elsevier_coordinate_extraction.client import ScienceDirectClient
1314
from urllib.parse import urlparse
1415

@@ -74,6 +75,8 @@ async def _runner() -> list[ArticleContent]:
7475
cache=cache,
7576
cache_namespace=cache_namespace,
7677
)
78+
except httpx.HTTPError:
79+
raise
7780
except Exception:
7881
continue
7982
if article is None:
@@ -154,17 +157,36 @@ async def _download_identifier(
154157
metadata["transport"] = "cache"
155158

156159
view_used = initial_view
160+
response_for_metadata: httpx.Response | None = None
157161
if payload is None:
158-
params = {"httpAccept": "text/xml", "view": view_used}
159162
path = _endpoint_path_for_identifier(identifier, identifier_type)
160-
response = await client.request(
161-
"GET",
162-
path,
163-
params=params,
164-
accept="application/xml",
165-
)
163+
params = {"httpAccept": "text/xml", "view": view_used}
164+
try:
165+
response = await client.request(
166+
"GET",
167+
path,
168+
params=params,
169+
accept="application/xml",
170+
)
171+
except httpx.HTTPStatusError as exc:
172+
if (
173+
view_used == "FULL"
174+
and exc.response.status_code == 400
175+
and _is_invalid_view_error(exc.response)
176+
):
177+
message = (
178+
"ScienceDirect rejected FULL view for "
179+
f"{identifier_type}:{identifier}. Ensure your credentials grant full-text access."
180+
)
181+
raise httpx.HTTPStatusError(
182+
message,
183+
request=exc.request,
184+
response=exc.response,
185+
) from exc
186+
raise
166187
payload = response.content
167188
content_type = response.headers.get("content-type", "application/xml")
189+
response_for_metadata = response
168190
metadata.update(
169191
{
170192
"transport": response.request.url.scheme,
@@ -176,14 +198,29 @@ async def _download_identifier(
176198
"identifier_type": identifier_type,
177199
}
178200
)
201+
snapshot = rate_limits.get_rate_limit_snapshot(response)
202+
metadata.update(snapshot.to_metadata())
179203
if cache is not None:
180204
await cache.set(cache_namespace, cache_key, payload)
181205

182206
full_text = _payload_contains_full_text(payload)
183207
inferred_view = "FULL" if full_text else "STANDARD"
184-
metadata.setdefault("view_requested", initial_view)
185-
metadata.setdefault("view_obtained", inferred_view)
186-
metadata.setdefault("view", metadata.get("view", inferred_view))
208+
if initial_view == "FULL" and not full_text:
209+
message = (
210+
"ScienceDirect returned metadata-only payload when FULL view was requested. "
211+
"Confirm your entitlements allow full-text retrieval."
212+
)
213+
if response_for_metadata is not None:
214+
raise httpx.HTTPStatusError(
215+
message,
216+
request=response_for_metadata.request,
217+
response=response_for_metadata,
218+
)
219+
raise RuntimeError(message + " Cached payload violates requirement.")
220+
221+
metadata["view_requested"] = metadata.get("view_requested", initial_view)
222+
metadata["view_obtained"] = inferred_view
223+
metadata["view"] = inferred_view
187224
metadata["full_text_retrieved"] = full_text
188225

189226
pii = _extract_pii(payload)
@@ -343,3 +380,16 @@ def _guess_cdn_url(api_url: str, extension: str | None) -> str | None:
343380
else:
344381
filename = f"{filename}.{extension}"
345382
return f"{_CDN_BASE}/{filename}"
383+
384+
385+
def _is_invalid_view_error(response: httpx.Response) -> bool:
386+
"""Detect Elsevier errors indicating the requested view is unsupported."""
387+
388+
status_header = response.headers.get("X-ELS-Status", "").lower()
389+
if "view" in status_header and "invalid" in status_header:
390+
return True
391+
try:
392+
body_text = response.text.lower()
393+
except Exception: # pragma: no cover - defensive fallback
394+
return False
395+
return "view" in body_text and "not valid" in body_text

elsevier_coordinate_extraction/rate_limits.py

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,37 @@
22

33
from __future__ import annotations
44

5+
from dataclasses import dataclass
56
from datetime import datetime, timezone
67
from email.utils import parsedate_to_datetime
78

89
import httpx
910

1011

12+
@dataclass(frozen=True)
class RateLimitSnapshot:
    """Immutable record of rate-limit values taken from response headers.

    Every field is ``None`` when the corresponding value is unavailable.
    """

    limit: int | None
    remaining: int | None
    reset_epoch: float | None

    def seconds_until_reset(self) -> float | None:
        """Return the non-negative number of seconds left before ``reset_epoch``.

        Returns ``None`` when no reset time is recorded; a reset time that is
        already in the past is reported as ``0.0``.
        """
        target = self.reset_epoch
        if target is not None:
            remaining_seconds = target - datetime.now(timezone.utc).timestamp()
            return max(remaining_seconds, 0.0)
        return None

    def to_metadata(self) -> dict[str, float | int | None]:
        """Return the three fields as a dict with ``rate_limit_``-prefixed keys."""
        keys = ("rate_limit_limit", "rate_limit_remaining", "rate_limit_reset_epoch")
        values = (self.limit, self.remaining, self.reset_epoch)
        return dict(zip(keys, values))
34+
35+
1136
def get_retry_delay(response: httpx.Response) -> float | None:
1237
"""Return a suggested delay (seconds) before retrying a request.
1338
@@ -16,6 +41,7 @@ def get_retry_delay(response: httpx.Response) -> float | None:
1641
provided we attempt to derive a delay from ``X-RateLimit-Reset``.
1742
"""
1843

44+
snapshot = get_rate_limit_snapshot(response)
1945
retry_after = response.headers.get("Retry-After")
2046
if retry_after:
2147
try:
@@ -31,14 +57,33 @@ def get_retry_delay(response: httpx.Response) -> float | None:
3157
delta = (dt - now).total_seconds()
3258
return max(delta, 0.0)
3359

34-
reset = response.headers.get("X-RateLimit-Reset")
35-
if reset:
60+
if snapshot.reset_epoch is not None:
61+
delay = snapshot.seconds_until_reset()
62+
if delay and delay > 0:
63+
return delay
64+
return None
65+
66+
67+
def get_rate_limit_snapshot(response: httpx.Response) -> RateLimitSnapshot:
    """Collect structured rate-limit header information from a response.

    Reads ``X-RateLimit-Limit`` and ``X-RateLimit-Remaining`` as integers and
    ``X-RateLimit-Reset`` as a float.  A header that is absent or does not
    parse yields ``None`` for its field.  Non-finite reset values (such as
    "nan" or "inf") are likewise treated as unparseable.

    Args:
        response: Response whose headers are inspected.

    Returns:
        A ``RateLimitSnapshot`` holding the parsed values.
    """
    import math  # local import so this block does not rely on the module's import list

    def _parse_int(value: str | None) -> int | None:
        """Return *value* as an int, or ``None`` if missing or malformed."""
        if value is None:
            return None
        try:
            return int(value)
        except ValueError:
            return None

    def _parse_float(value: str | None) -> float | None:
        """Return *value* as a finite float, or ``None`` otherwise."""
        if value is None:
            return None
        try:
            parsed = float(value)
        except ValueError:
            return None
        # float() happily parses "nan"/"inf"; neither can describe a reset
        # time, so do not let them flow into delay arithmetic or metadata.
        return parsed if math.isfinite(parsed) else None

    headers = response.headers
    return RateLimitSnapshot(
        limit=_parse_int(headers.get("X-RateLimit-Limit")),
        remaining=_parse_int(headers.get("X-RateLimit-Remaining")),
        reset_epoch=_parse_float(headers.get("X-RateLimit-Reset")),
    )

elsevier_coordinate_extraction/settings.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
_DEFAULT_CONCURRENCY: Final[int] = 4
1515
_DEFAULT_CACHE_DIR: Final[str] = ".elsevier_cache"
1616
_DEFAULT_USER_AGENT: Final[str] = "elsevierCoordinateExtraction/0.1.0"
17+
_DEFAULT_MAX_RATE_LIMIT_WAIT: Final[float] = 3600.0 # 1 hour
1718

1819
_CACHED_SETTINGS: Settings | None = None
1920

@@ -32,6 +33,7 @@ class Settings:
3233
http_proxy: str | None
3334
https_proxy: str | None
3435
use_proxy: bool
36+
max_rate_limit_wait: float | None
3537

3638

3739
_TRUE_VALUES: Final[set[str]] = {"1", "true", "yes", "on"}
@@ -86,6 +88,16 @@ def get_settings(*, force_reload: bool = False) -> Settings:
8688
default=default_use_proxy,
8789
)
8890

91+
max_wait_raw = os.getenv("ELSEVIER_MAX_RATE_LIMIT_WAIT_SECONDS")
92+
if max_wait_raw is None or not max_wait_raw.strip():
93+
max_rate_limit_wait: float | None = _DEFAULT_MAX_RATE_LIMIT_WAIT
94+
else:
95+
normalized = max_wait_raw.strip().lower()
96+
if normalized in {"none", "infinite", "inf", "unlimited"}:
97+
max_rate_limit_wait = None
98+
else:
99+
max_rate_limit_wait = max(float(max_wait_raw), 0.0)
100+
89101
_CACHED_SETTINGS = Settings(
90102
api_key=api_key,
91103
base_url=base_url,
@@ -97,5 +109,6 @@ def get_settings(*, force_reload: bool = False) -> Settings:
97109
http_proxy=http_proxy,
98110
https_proxy=https_proxy,
99111
use_proxy=use_proxy,
112+
max_rate_limit_wait=max_rate_limit_wait,
100113
)
101114
return _CACHED_SETTINGS

0 commit comments

Comments
 (0)