22
33from __future__ import annotations
44
5+ import inspect
56import re
6- from collections .abc import Mapping , Sequence
7- from typing import Any
7+ from collections .abc import Awaitable , Mapping , Sequence
8+ from typing import Any , Protocol
89
910import httpx
1011from lxml import etree
4344_CDN_BASE = "https://ars.els-cdn.com/content/image"
4445
4546
class ProgressCallback(Protocol):
    """Structural type for per-record progress hooks.

    An implementation is called with three positional arguments:

    * ``record`` -- the input mapping that was just processed.
    * ``article`` -- the resulting ``ArticleContent``, or ``None`` when
      nothing was produced for the record.
    * ``error`` -- the exception associated with the record, or ``None``.

    The hook may be a plain function (returning ``None``) or may return an
    awaitable that resolves to ``None``.
    """

    def __call__(
        self,
        record: Mapping[str, str],
        article: ArticleContent | None,
        error: BaseException | None,
    ) -> Awaitable[None] | None:
        ...
59+
4660async def download_articles (
4761 records : Sequence [Mapping [str , str ]],
4862 * ,
4963 client : ScienceDirectClient | None = None ,
5064 cache : Any | None = None ,
5165 cache_namespace : str = "articles" ,
5266 settings : Settings | None = None ,
67+ progress_callback : ProgressCallback | None = None ,
5368) -> list [ArticleContent ]:
5469 """Download ScienceDirect articles identified by DOI and/or PubMed ID records.
5570
5671 Each record in ``records`` should contain at least one of the keys ``"doi"`` or ``"pmid"``.
5772 For every record, the downloader first attempts to retrieve the FULL text using the DOI
5873 (when present); if that fails, it retries with the PubMed ID. A successful download using
5974 either identifier stops further attempts for that record.
75+
76+ When ``progress_callback`` is provided it will be invoked after each record finishes processing.
77+ The callback receives the original record, the downloaded ``ArticleContent`` when successful
78+ (``None`` when no payload is returned), and the exception raised while processing
79+ (``None`` on success). Callbacks may be synchronous or async functions.
6080 """
6181 if not records :
6282 return []
@@ -65,23 +85,39 @@ async def download_articles(
6585 owns_client = client is None
6686 sci_client = client or ScienceDirectClient (cfg )
6787
async def _emit_progress(
    record: Mapping[str, str],
    article: ArticleContent | None,
    error: BaseException | None,
) -> None:
    """Forward one record's outcome to ``progress_callback`` when one is set.

    The callback is called with ``(record, article, error)``. If the value it
    returns is awaitable, that value is awaited before this coroutine
    finishes; otherwise the return value is ignored. Nothing happens when
    ``progress_callback`` is ``None``.
    """
    if progress_callback is not None:
        outcome = progress_callback(record, article, error)
        # Sync hooks return a plain value; async hooks hand back an awaitable.
        if inspect.isawaitable(outcome):
            await outcome
98+
async def _runner() -> list[ArticleContent]:
    """Process ``records`` one at a time and collect the downloaded articles.

    For each record ``_download_record`` is awaited and the outcome is
    reported through ``_emit_progress``:

    * ``httpx.HTTPError`` -- reported with the exception, then re-raised,
      which ends the loop.
    * any other ``Exception`` -- reported with the exception; the record is
      skipped and the loop moves on.
    * a ``None`` result -- reported as ``(record, None, None)`` and skipped.
    * an article -- appended to the result list, then reported.

    Returns the articles in the order their records were processed.
    """
    downloaded: list[ArticleContent] = []
    for record in records:
        try:
            article: ArticleContent | None = await _download_record(
                record=record,
                client=sci_client,
                cache=cache,
                cache_namespace=cache_namespace,
            )
        except httpx.HTTPError as exc:
            await _emit_progress(record, None, exc)
            raise
        except Exception as exc:
            await _emit_progress(record, None, exc)
            continue
        # Collect before notifying so the hook runs after the result is stored.
        if article is not None:
            downloaded.append(article)
        await _emit_progress(record, article, None)
    return downloaded
86122
87123 if owns_client :
0 commit comments