
Commit e004628

feat: chemrxiv API much faster (#89)
1 parent 3c0e328 commit e004628


4 files changed: +140 −61 lines changed


README.md

Lines changed: 5 additions & 5 deletions
@@ -41,16 +41,16 @@ This is enough to query PubMed, arXiv or Google Scholar.
 
 #### Download X-rxiv Dumps
 
-However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire dump is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line).
+However, to scrape publication data from the preprint servers [biorxiv](https://www.biorxiv.org), [medrxiv](https://www.medrxiv.org) and [chemrxiv](https://www.chemrxiv.org), the setup is different. The entire history of papers is downloaded and stored in the `server_dumps` folder in a `.jsonl` format (one paper per line). This takes a while; as of November 2025:
 
 ```py
 from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
-medrxiv() # Takes ~30min and should result in ~35 MB file
-biorxiv() # Takes ~1h and should result in ~350 MB file
-chemrxiv() # Takes ~45min and should result in ~20 MB file
+chemrxiv() # Takes 30min -> +30K papers (~50 MB file)
+medrxiv() # Takes <1h -> +90K papers (~200 MB file)
+biorxiv() # Up to 6h -> +400K papers (~800 MB file)
 ```
 *NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
-*NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
+*NOTE*: If you experience API connection issues, since v0.2.12 there are automatic retries, which you can control and raise from the default of 10, as in `biorxiv(max_retries=20)`.
 
 Since v0.2.5 `paperscraper` also allows to scrape {med/bio/chem}rxiv for specific dates.
 ```py
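For context on the dump format described above: each line of the generated `.jsonl` file is one standalone JSON record per paper, so a dump can be read line by line. A minimal sketch of loading one (the filename below is hypothetical; use whichever file is actually present in your `server_dumps` folder):

```py
import json

# Hypothetical filename for illustration; pick the dump actually present in server_dumps/.
dump_path = "server_dumps/chemrxiv_2025-11-01.jsonl"

with open(dump_path) as f:
    papers = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(papers)} papers")
```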

paperscraper/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """Initialize the module."""
 
 __name__ = "paperscraper"
-__version__ = "0.3.2"
+__version__ = "0.3.3"
 
 import logging
 import os

paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py

Lines changed: 115 additions & 43 deletions
@@ -2,12 +2,19 @@
 import os
 import sys
 from datetime import datetime
-from time import time
+from time import sleep
 from typing import Dict, Optional
 from urllib.parse import urljoin
 
 import requests
-from requests.exceptions import ChunkedEncodingError
+from requests.exceptions import (
+    ChunkedEncodingError,
+    ConnectionError,
+    ContentDecodingError,
+    JSONDecodeError,
+    ReadTimeout,
+)
+from urllib3.exceptions import DecodeError
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = logging.getLogger(__name__)
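The `request()` rewrite in the next hunk retries these exception types (plus transient HTTP status codes) with a capped exponential backoff instead of a fixed 3-second sleep. A minimal standalone sketch of that delay schedule, not library code; the start value, cap, and default retry count mirror the diff:

```py
# Delays grow 0.1 s, 0.2 s, 0.4 s, ... between retries and are capped at 60 s.
def backoff_delays(max_retries: int = 10, start: float = 0.1, cap: float = 60.0):
    delay = start
    for _ in range(max_retries):
        yield delay
        delay = min(cap, delay * 2)

print([round(d, 1) for d in backoff_delays(6)])  # [0.1, 0.2, 0.4, 0.8, 1.6, 3.2]
```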
@@ -72,63 +79,128 @@ def __init__(
         else:
             self.end_date = now_datetime.strftime("%Y-%m-%d")
 
-    def request(self, url, method, params=None):
+    def request(self, url, method, params=None, parse_json: bool = False):
         """Send an API request to open Engage."""
 
+        headers = {"Accept-Encoding": "identity", "Accept": "application/json"}
+        retryable = (
+            ChunkedEncodingError,
+            ContentDecodingError,
+            DecodeError,
+            ReadTimeout,
+            ConnectionError,
+        )
+        transient_status = {429, 500, 502, 503, 504}
+        backoff = 0.1
+
         for attempt in range(self.max_retries):
             try:
                 if method.casefold() == "get":
-                    return requests.get(url, params=params, timeout=10)
+                    response = requests.get(
+                        url, params=params, headers=headers, timeout=(5, 30)
+                    )
                 elif method.casefold() == "post":
-                    return requests.post(url, json=params, timeout=10)
+                    response = requests.post(
+                        url, json=params, headers=headers, timeout=(5, 30)
+                    )
                 else:
                     raise ConnectionError(f"Unknown method for query: {method}")
-            except ChunkedEncodingError as e:
-                logger.warning(f"ChunkedEncodingError occurred for {url}: {e}")
+                if response.status_code in transient_status:
+                    logger.warning(
+                        f"{response.status_code} for {url} (attempt {attempt + 1}/{self.max_retries}); retrying in {backoff:.1f}s"
+                    )
+                    if attempt + 1 == self.max_retries:
+                        response.raise_for_status()
+                    sleep(backoff)
+                    backoff = min(60.0, backoff * 2)
+                    continue
+                elif 400 <= response.status_code < 500:
+                    response.raise_for_status()
+                if not parse_json:
+                    return response
+
+                try:
+                    return response.json()
+                except JSONDecodeError:
+                    logger.warning(
+                        f"JSONDecodeError for {response.url} "
+                        f"(attempt {attempt + 1}/{self.max_retries}); retrying in {backoff:.1f}s"
+                    )
+                    if attempt + 1 == self.max_retries:
+                        raise
+                    sleep(backoff)
+                    backoff = min(60.0, backoff * 2)
+                    continue
+
+            except retryable as e:
+                logger.warning(
+                    f"{e.__class__.__name__} for {url} (attempt {attempt + 1}/{self.max_retries}); "
+                    f"retrying in {backoff:.1f}s"
+                )
                 if attempt + 1 == self.max_retries:
-                    raise e
-                time.sleep(3)
+                    raise
+                sleep(backoff)
+                backoff = min(60.0, backoff * 2)
 
     def query(self, query, method="get", params=None):
         """Perform a direct query."""
 
-        r = self.request(urljoin(self.base, query), method, params=params)
-        r.raise_for_status()
-        return r.json()
+        return self.request(
+            urljoin(self.base, query), method, params=params, parse_json=True
+        )
 
-    def query_generator(self, query, method: str = "get", params: Dict = {}):
+    def query_generator(
+        self, query, method: str = "get", params: Optional[Dict] = None
+    ):
         """Query for a list of items, with paging. Returns a generator."""
 
-        try:
-            total = self.number_of_preprints()
-        except Exception:
-            total = float("inf")  # fallback if that call fails
-
-        page = 0
-        while True:
-            params.update(
-                {
-                    "limit": self.page_size,
-                    "skip": page * self.page_size,
-                    "searchDateFrom": self.start_date,
-                    "searchDateTo": self.end_date,
-                }
-            )
-            if page * self.page_size > total:
-                break
-            r = self.request(urljoin(self.base, query), method, params=params)
-            if r.status_code == 400:
-                raise ValueError(r.json()["message"])
-            r.raise_for_status()
-            r = r.json()
-            r = r["itemHits"]
-
-            # If we have no more results, bail out
-            if len(r) == 0:
-                return
-
-            yield from r
-            page += 1
+        start_datetime = datetime.fromisoformat(self.start_date)
+        end_datetime = datetime.fromisoformat(self.end_date)
+
+        def year_windows():
+            year = start_datetime.year
+            while year <= end_datetime.year:
+                year_start = datetime(year, 1, 1)
+                year_end = datetime(year, 12, 31)
+                win_start = max(start_datetime, year_start)
+                win_end = min(end_datetime, year_end)
+                yield win_start.strftime("%Y-%m-%d"), win_end.strftime("%Y-%m-%d")
+                year += 1
+
+        params = (params or {}).copy()
+
+        for year_from, year_to in year_windows():
+            logger.info(f"Starting to scrape data from {year_from} to {year_to}")
+            page = 0
+            while True:
+                params.update(
+                    {
+                        "limit": self.page_size,
+                        "skip": page * self.page_size,
+                        "searchDateFrom": year_from,
+                        "searchDateTo": year_to,
+                    }
+                )
+                try:
+                    data = self.request(
+                        urljoin(self.base, query),
+                        method,
+                        params=params,
+                        parse_json=True,
+                    )
+                except requests.HTTPError as e:
+                    status = getattr(e.response, "status_code", None)
+                    logger.warning(
+                        f"Stopping year window {year_from}..{year_to} at skip={page * self.page_size} "
+                        f"due to HTTPError {status}"
+                    )
+                    break
+                items = data.get("itemHits", [])
+                if not items:
+                    break
+                for item in items:
+                    yield item
+                page += 1
 
     def all_preprints(self):
         """Return a generator to all the chemRxiv articles."""

paperscraper/get_dumps/utils/chemrxiv/utils.py

Lines changed: 19 additions & 12 deletions
@@ -7,9 +7,15 @@
 from datetime import datetime
 from typing import Dict, List, Optional
 
-from requests.exceptions import SSLError
+from requests.exceptions import (
+    ChunkedEncodingError,
+    ContentDecodingError,
+    JSONDecodeError,
+    SSLError,
+)
 from requests.models import HTTPError
 from tqdm import tqdm
+from urllib3.exceptions import DecodeError
 
 from .chemrxiv_api import ChemrxivAPI
 
@@ -127,20 +133,21 @@ def parse_dump(source_path: str, target_path: str) -> None:
 def download_full(save_dir: str, api: Optional[ChemrxivAPI] = None) -> None:
     if api is None:
         api = ChemrxivAPI()
-
     os.makedirs(save_dir, exist_ok=True)
+
     for preprint in tqdm(api.all_preprints()):
-        path = os.path.join(save_dir, f"{preprint['item']['id']}.json")
+        item = preprint["item"]
+        path = os.path.join(save_dir, f"{item['id']}.json")
         if os.path.exists(path):
             continue
-        preprint = preprint["item"]
-        preprint_id = preprint["id"]
-        try:
-            preprint = api.preprint(preprint_id)
-        except HTTPError:
-            logger.warning(f"HTTP API Client error for ID: {preprint_id}")
-        except SSLError:
-            logger.warning(f"SSLError for ID: {preprint_id}")
+
+        if not item.get("title") or "authors" not in item:
+            try:
+                item = api.preprint(item["id"])
+            except Exception as e:
+                logger.warning(
+                    f"Enrich failed for {item['id']}: {e}; writing listing payload"
+                )
 
         with open(path, "w") as file:
-            json.dump(preprint, file, indent=2)
+            json.dump(item, file, indent=2)
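`download_full` now writes the listing payload directly and only calls `api.preprint()` when key fields are missing, which saves one API round-trip per paper in the common case. A minimal sketch of that gating check (illustration only; the condition is the one used in the diff):

```py
def needs_enrichment(item: dict) -> bool:
    # Same condition as in download_full: enrich only if title or authors are missing.
    return not item.get("title") or "authors" not in item

print(needs_enrichment({"id": "x1", "title": "Example preprint", "authors": []}))  # False
print(needs_enrichment({"id": "x2"}))  # True -> fall back to api.preprint(item["id"])
```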
