Skip to content

Commit 18a94d9

Browse files
committed
make download loop more resilient
1 parent ea3a4e9 commit 18a94d9

File tree

2 files changed

+97
-14
lines changed

2 files changed

+97
-14
lines changed

elsevier_coordinate_extraction/download/api.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ async def download_articles(
7676
When ``progress_callback`` is provided it will be invoked after each record finishes processing.
7777
The callback receives the original record, the downloaded ``ArticleContent`` when successful
7878
(``None`` when no payload is returned), and the exception raised while processing
79-
(``None`` on success). Callbacks may be synchronous or async functions.
79+
(``None`` on success). Callbacks may be synchronous or async functions. Download processing
80+
continues through the full list even when individual records fail.
8081
"""
8182
if not records:
8283
return []
@@ -107,9 +108,6 @@ async def _runner() -> list[ArticleContent]:
107108
cache=cache,
108109
cache_namespace=cache_namespace,
109110
)
110-
except httpx.HTTPError as exc:
111-
await _emit_progress(record, None, exc)
112-
raise
113111
except Exception as exc:
114112
await _emit_progress(record, None, exc)
115113
continue

tests/download/test_api.py

Lines changed: 95 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ async def test_download_single_article_xml(test_dois: Sequence[str]) -> None:
5757

5858
@pytest.mark.asyncio()
5959
async def test_download_marks_truncated_full_text() -> None:
60-
"""When the payload lacks body content we should mark the view as STANDARD."""
60+
"""When the payload lacks body content we emit an error but keep going."""
6161

6262
doi = "10.1016/j.neucli.2007.12.007"
6363
payload = b"""
@@ -80,16 +80,30 @@ async def handler(request: httpx.Request) -> httpx.Response:
8080

8181
cfg = _test_settings()
8282
transport = httpx.MockTransport(handler)
83+
progress_calls: list[tuple[dict[str, str], ArticleContent | None, BaseException | None]] = []
84+
85+
def progress_cb(
86+
record: dict[str, str],
87+
article: ArticleContent | None,
88+
error: BaseException | None,
89+
) -> None:
90+
progress_calls.append((record, article, error))
91+
8392
async with ScienceDirectClient(cfg, transport=transport) as client:
84-
with pytest.raises(httpx.HTTPStatusError, match="metadata-only payload"):
85-
await download_articles([{"doi": doi}], client=client)
93+
articles = await download_articles([{"doi": doi}], client=client, progress_callback=progress_cb)
8694

8795
assert len(captured_requests) == 1
96+
assert articles == []
97+
assert len(progress_calls) == 1
98+
record, article, error = progress_calls[0]
99+
assert record["doi"] == doi
100+
assert article is None
101+
assert isinstance(error, httpx.HTTPStatusError)
88102

89103

90104
@pytest.mark.asyncio()
91105
async def test_download_errors_when_full_view_invalid(test_dois: Sequence[str]) -> None:
92-
"""Client should raise if Elsevier rejects FULL view requests."""
106+
"""Client reports invalid FULL view errors without raising."""
93107

94108
doi = test_dois[0]
95109

@@ -108,9 +122,24 @@ async def handler(request: httpx.Request) -> httpx.Response:
108122

109123
cfg = _test_settings()
110124
transport = httpx.MockTransport(handler)
125+
progress_calls: list[tuple[dict[str, str], ArticleContent | None, BaseException | None]] = []
126+
127+
def progress_cb(
128+
record: dict[str, str],
129+
article: ArticleContent | None,
130+
error: BaseException | None,
131+
) -> None:
132+
progress_calls.append((record, article, error))
133+
111134
async with ScienceDirectClient(cfg, transport=transport) as client:
112-
with pytest.raises(httpx.HTTPStatusError, match="rejected FULL view"):
113-
await download_articles([{"doi": doi}], client=client)
135+
articles = await download_articles([{"doi": doi}], client=client, progress_callback=progress_cb)
136+
137+
assert articles == []
138+
assert len(progress_calls) == 1
139+
record, article, error = progress_calls[0]
140+
assert record["doi"] == doi
141+
assert article is None
142+
assert isinstance(error, httpx.HTTPStatusError)
114143

115144

116145
@pytest.mark.asyncio()
@@ -271,7 +300,7 @@ def progress_cb(
271300

272301
@pytest.mark.asyncio()
273302
async def test_download_progress_callback_receives_errors(test_dois: Sequence[str]) -> None:
274-
"""Progress callback should receive exceptions before they propagate."""
303+
"""Progress callback should receive exceptions even when downloads do not raise."""
275304

276305
cfg = _test_settings()
277306
doi = test_dois[0]
@@ -290,12 +319,68 @@ async def progress_cb(
290319
await asyncio.sleep(0)
291320
progress_calls.append((record, article, error))
292321

293-
with pytest.raises(httpx.TimeoutException):
294-
async with ScienceDirectClient(cfg, transport=transport) as client:
295-
await download_articles([{"doi": doi}], client=client, progress_callback=progress_cb)
322+
async with ScienceDirectClient(cfg, transport=transport) as client:
323+
articles = await download_articles([{"doi": doi}], client=client, progress_callback=progress_cb)
296324

325+
assert articles == []
297326
assert len(progress_calls) == 1
298327
record, article, error = progress_calls[0]
299328
assert record["doi"] == doi
300329
assert article is None
301330
assert isinstance(error, httpx.TimeoutException)
331+
332+
333+
@pytest.mark.asyncio()
334+
async def test_download_continues_after_identifier_error(test_dois: Sequence[str]) -> None:
335+
"""Errors for one record should not prevent later records from being processed."""
336+
337+
cfg = _test_settings()
338+
bad_doi, good_doi = test_dois[:2]
339+
340+
call_counter = 0
341+
342+
async def handler(request: httpx.Request) -> httpx.Response:
343+
nonlocal call_counter
344+
call_counter += 1
345+
doi = request.url.path.rsplit("/", 1)[-1]
346+
if call_counter == 1:
347+
raise httpx.TimeoutException("simulated timeout")
348+
payload = f"""
349+
<article xmlns="http://www.elsevier.com/xml/svapi/article/dtd" xmlns:ce="http://www.elsevier.com/xml/common/dtd">
350+
<item-info>
351+
<doi>{doi}</doi>
352+
<pii>S105381192400679X</pii>
353+
</item-info>
354+
<ce:body><ce:para>{doi}</ce:para></ce:body>
355+
</article>
356+
""".encode("utf-8")
357+
return httpx.Response(
358+
200,
359+
content=payload,
360+
headers={"content-type": "application/xml"},
361+
request=request,
362+
)
363+
364+
transport = httpx.MockTransport(handler)
365+
progress_calls: list[tuple[dict[str, str], ArticleContent | None, BaseException | None]] = []
366+
367+
def progress_cb(
368+
record: dict[str, str],
369+
article: ArticleContent | None,
370+
error: BaseException | None,
371+
) -> None:
372+
progress_calls.append((record, article, error))
373+
374+
records = [{"doi": bad_doi}, {"doi": good_doi}]
375+
async with ScienceDirectClient(cfg, transport=transport) as client:
376+
articles = await download_articles(records, client=client, progress_callback=progress_cb)
377+
378+
assert len(articles) == 1
379+
assert articles[0].doi == good_doi
380+
assert len(progress_calls) == 2
381+
assert progress_calls[0][0]["doi"] == bad_doi
382+
assert progress_calls[0][1] is None
383+
assert isinstance(progress_calls[0][2], httpx.TimeoutException)
384+
assert progress_calls[1][0]["doi"] == good_doi
385+
assert progress_calls[1][1] is not None
386+
assert progress_calls[1][2] is None

0 commit comments

Comments
 (0)