
Commit e004628

feat: chemrxiv API much faster (#89)
1 parent 3c0e328 commit e004628


4 files changed: +140 −61 lines changed


README.md

Lines changed: 5 additions & 5 deletions
@@ -41,16 +41,16 @@ This is enough to query PubMed, arXiv or Google Scholar.
 
 #### Download X-rxiv Dumps
 
-However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire dump is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line).
+However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while; as of November 2025:
 
 ```py
 from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
-medrxiv() # Takes ~30min and should result in ~35 MB file
-biorxiv() # Takes ~1h and should result in ~350 MB file
-chemrxiv() # Takes ~45min and should result in ~20 MB file
+chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
+medrxiv() # Takes <1h -> +90K papers (~200 MB file)
+biorxiv() # Up to 6h -> +400K papers (~800 MB file)
 ```
 *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
-*NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
+*NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries, which you can control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
 
 Since v0.2.5 `paperscraper` also allows to scrape {med/bio/chem}rxiv for specific dates.
 ```py
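For context on the dump format described above: each line of the generated `.jsonl` file is one standalone JSON record per paper, so a dump can be read line by line. A minimal sketch of loading one (the filename below is hypothetical; use whichever file is actually present in your `server_dumps` folder):

```py
import json

# Hypothetical filename for illustration; pick the dump actually present in server_dumps/.
dump_path = "server_dumps/chemrxiv_2025-11-01.jsonl"

with open(dump_path) as f:
    papers = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(papers)} papers")
```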

paperscraper/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """Initialize the module."""
 
 __name__ = "paperscraper"
-__version__ = "0.3.2"
+__version__ = "0.3.3"
 
 import logging
 import os

paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py

Lines changed: 115 additions & 43 deletions
@@ -2,12 +2,19 @@
 import os
 import sys
 from datetime import datetime
-from time import time
+from time import sleep
 from typing import Dict, Optional
 from urllib.parse import urljoin
 
 import requests
-from requests.exceptions import ChunkedEncodingError
+from requests.exceptions import (
+    ChunkedEncodingError,
+    ConnectionError,
+    ContentDecodingError,
+    JSONDecodeError,
+    ReadTimeout,
+)
+from urllib3.exceptions import DecodeError
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = logging.getLogger(__name__)
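The `request()` rewrite in the next hunk retries these exception types (plus transient HTTP status codes) with a capped exponential backoff instead of a fixed 3-second sleep. A minimal standalone sketch of that delay schedule, not library code; the start value, cap, and default retry count mirror the diff:

```py
# Delays grow 0.1 s, 0.2 s, 0.4 s, ... between retries and are capped at 60 s.
def backoff_delays(max_retries: int = 10, start: float = 0.1, cap: float = 60.0):
    delay = start
    for _ in range(max_retries):
        yield delay
        delay = min(cap, delay * 2)

print([round(d, 1) for d in backoff_delays(6)])  # [0.1, 0.2, 0.4, 0.8, 1.6, 3.2]
```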
@@ -72,63 +79,128 @@ def __init__(
         else:
             self.end_date = now_datetime.strftime("%Y-%m-%d")
 
-    def request(self, url, method, params=None):
+    def request(self, url, method, params=None, parse_json: bool = False):
         """Send an API request to open Engage."""
 
+        headers = {"Accept-Encoding": "identity", "Accept": "application/json"}
+        retryable = (
+            ChunkedEncodingError,
+            ContentDecodingError,
+            DecodeError,
+            ReadTimeout,
+            ConnectionError,
+        )
+        transient_status = {429, 500, 502, 503, 504}
+        backoff = 0.1
+
         for attempt in range(self.max_retries):
             try:
                 if method.casefold() == "get":
-                    return requests.get(url, params=params, timeout=10)
+                    response = requests.get(
+                        url, params=params, headers=headers, timeout=(5, 30)
+                    )
                 elif method.casefold() == "post":
-                    return requests.post(url, json=params, timeout=10)
+                    response = requests.post(
+                        url, json=params, headers=headers, timeout=(5, 30)
+                    )
                 else:
                     raise ConnectionError(f"Unknown method for query: {method}")
-            except ChunkedEncodingError as e:
-                logger.warning(f"ChunkedEncodingError occurred for {url}: {e}")
+                if response.status_code in transient_status:
+                    logger.warning(
+                        f"{response.status_code} for {url} (attempt {attempt + 1}/{self.max_retries}); retrying in {backoff:.1f}s"
+                    )
+                    if attempt + 1 == self.max_retries:
+                        response.raise_for_status()
+                    sleep(backoff)
+                    backoff = min(60.0, backoff * 2)
+                    continue
+                elif 400 <= response.status_code < 500:
+                    response.raise_for_status()
+                if not parse_json:
+                    return response
+
+                try:
+                    return response.json()
+                except JSONDecodeError:
+                    logger.warning(
+                        f"JSONDecodeError for {response.url} "
+                        f"(attempt {attempt + 1}/{self.max_retries}); retrying in {backoff:.1f}s"
+                    )
+                    if attempt + 1 == self.max_retries:
+                        raise
+                    sleep(backoff)
+                    backoff = min(60.0, backoff * 2)
+                    continue
+
+            except retryable as e:
+                logger.warning(
+                    f"{e.__class__.__name__} for {url} (attempt {attempt + 1}/{self.max_retries}); "
+                    f"retrying in {backoff:.1f}s"
+                )
                 if attempt + 1 == self.max_retries:
-                    raise e
-                time.sleep(3)
+                    raise
+                sleep(backoff)
+                backoff = min(60.0, backoff * 2)
 
     def query(self, query, method="get", params=None):
         """Perform a direct query."""
 
-        r = self.request(urljoin(self.base, query), method, params=params)
-        r.raise_for_status()
-        return r.json()
+        return self.request(
+            urljoin(self.base, query), method, params=params, parse_json=True
+        )
 
-    def query_generator(self, query, method: str = "get", params: Dict = {}):
+    def query_generator(
+        self, query, method: str = "get", params: Optional[Dict] = None
+    ):
         """Query for a list of items, with paging. Returns a generator."""
 
-        try:
-            total = self.number_of_preprints()
-        except Exception:
-            total = float("inf")  # fallback if that call fails
-
-        page = 0
-        while True:
-            params.update(
-                {
-                    "limit": self.page_size,
-                    "skip": page * self.page_size,
-                    "searchDateFrom": self.start_date,
-                    "searchDateTo": self.end_date,
-                }
-            )
-            if page * self.page_size > total:
-                break
-            r = self.request(urljoin(self.base, query), method, params=params)
-            if r.status_code == 400:
-                raise ValueError(r.json()["message"])
-            r.raise_for_status()
-            r = r.json()
-            r = r["itemHits"]
-
-            # If we have no more results, bail out
-            if len(r) == 0:
-                return
-
-            yield from r
-            page += 1
+        start_datetime = datetime.fromisoformat(self.start_date)
+        end_datetime = datetime.fromisoformat(self.end_date)
+
+        def year_windows():
+            year = start_datetime.year
+            while year <= end_datetime.year:
+                year_start = datetime(year, 1, 1)
+                year_end = datetime(year, 12, 31)
+                win_start = max(start_datetime, year_start)
+                win_end = min(end_datetime, year_end)
+                yield win_start.strftime("%Y-%m-%d"), win_end.strftime("%Y-%m-%d")
+                year += 1
+
+        params = (params or {}).copy()
+
+        for year_from, year_to in year_windows():
+            logger.info(f"Starting to scrape data from {year_from} to {year_to}")
+            page = 0
+            while True:
+                params.update(
+                    {
+                        "limit": self.page_size,
+                        "skip": page * self.page_size,
+                        "searchDateFrom": year_from,
+                        "searchDateTo": year_to,
+                    }
+                )
+                try:
+                    data = self.request(
+                        urljoin(self.base, query),
+                        method,
+                        params=params,
+                        parse_json=True,
+                    )
+                except requests.HTTPError as e:
+                    status = getattr(e.response, "status_code", None)
+                    logger.warning(
+                        f"Stopping year window {year_from}..{year_to} at skip={page * self.page_size} "
+                        f"due to HTTPError {status}"
+                    )
+                    break
+                items = data.get("itemHits", [])
+                if not items:
+                    break
+                for item in items:
+                    yield item
+                page += 1
 
     def all_preprints(self):
         """Return a generator to all the chemRxiv articles."""

paperscraper/get_dumps/utils/chemrxiv/utils.py

Lines changed: 19 additions & 12 deletions
@@ -7,9 +7,15 @@
 from datetime import datetime
 from typing import Dict, List, Optional
 
-from requests.exceptions import SSLError
+from requests.exceptions import (
+    ChunkedEncodingError,
+    ContentDecodingError,
+    JSONDecodeError,
+    SSLError,
+)
 from requests.models import HTTPError
 from tqdm import tqdm
+from urllib3.exceptions import DecodeError
 
 from .chemrxiv_api import ChemrxivAPI
 
@@ -127,20 +133,21 @@ def parse_dump(source_path: str, target_path: str) -> None:
 def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:
     if api is None:
         api = ChemrxivAPI()
-
     os.makedirs(save_dir, exist_ok=True)
+
     for preprint in tqdm(api.all_preprints()):
-        path = os.path.join(save_dir, f"{preprint['item']['id']}.json")
+        item = preprint["item"]
+        path = os.path.join(save_dir, f"{item['id']}.json")
         if os.path.exists(path):
             continue
-        preprint = preprint["item"]
-        preprint_id = preprint["id"]
-        try:
-            preprint = api.preprint(preprint_id)
-        except HTTPError:
-            logger.warning(f"HTTP API Client error for ID: {preprint_id}")
-        except SSLError:
-            logger.warning(f"SSLError for ID: {preprint_id}")
+
+        if not item.get("title") or "authors" not in item:
+            try:
+                item = api.preprint(item["id"])
+            except Exception as e:
+                logger.warning(
+                    f"Enrich failed for {item['id']}: {e}; writing listing payload"
+                )
 
         with open(path, "w") as file:
-            json.dump(preprint, file, indent=2)
+            json.dump(item, file, indent=2)
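`download_full` now writes the listing payload directly and only calls `api.preprint()` when key fields are missing, which saves one API round-trip per paper in the common case. A minimal sketch of that gating check (illustration only; the condition is the one used in the diff):

```py
def needs_enrichment(item: dict) -> bool:
    # Same condition as in download_full: enrich only if title or authors are missing.
    return not item.get("title") or "authors" not in item

print(needs_enrichment({"id": "x1", "title": "Example preprint", "authors": []}))  # False
print(needs_enrichment({"id": "x2"}))  # True -> fall back to api.preprint(item["id"])
```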
