Commit d6b893d

memray and jannisborn authored

add retry logic in XRXivApi to tackle request timed out (#43)

* add retry logic in XRXivApi to tackle request timed out
* chore: add timeout elsewhere in package
* refactor: parameterize max_retries
* chore: add request timeout
* chore: populate max_retries to top level
* chore: version bump
* ci: test tip when coming from fork
* doc: Update README
* test: add pytest for fetching logs

Co-authored-by: jannisborn <[email protected]>

1 parent 444eb13 commit d6b893d

File tree

10 files changed: +143 additions, -40 deletions

.github/workflows/test_tip.yml

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v2
         with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
           ref: ${{ github.event.pull_request.head.ref }}

       - name: Set up Python 3.8

README.md

Lines changed: 2 additions & 3 deletions
@@ -17,7 +17,6 @@ publication metadata as well as full PDF files from **PubMed** or from preprint
 **medRxiv**, **bioRxiv** and **chemRxiv**. It provides a streamlined interface to scrape metadata and comes
 with simple postprocessing functions and plotting routines for meta-analysis.

-Since v0.2.4 `paperscraper` also supports scraping PDF files directly! Thanks to [@daenuprobst](https://github.com/daenuprobst) for suggestions!

 ## Getting started

@@ -37,8 +36,8 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
 biorxiv() # Takes ~1h and should result in ~350 MB file
 chemrxiv() # Takes ~45min and should result in ~20 MB file
 ```
-*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
-so that the changes take effect.
+*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
+*NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`, thanks to [@memray](https://github.com/memray) for contributions!

 Since v0.2.5 `paperscraper` also allows to scrape {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
 ```py
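
Taken together, the README changes describe the new retry control; a minimal usage sketch (imports as exposed by `paperscraper.get_dumps`, per the test added below):

```py
from paperscraper.get_dumps import biorxiv, medrxiv

# Default: up to 10 automatic retries on connection issues (since v0.2.12)
medrxiv()

# Raise the retry budget for flaky connections, as suggested in the README
biorxiv(max_retries=20)
```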

paperscraper/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """Initialize the module."""

 __name__ = "paperscraper"
-__version__ = "0.2.11"
+__version__ = "0.2.12"

 import logging
 import os

paperscraper/get_dumps/biorxiv.py

Lines changed: 9 additions & 5 deletions
@@ -1,4 +1,5 @@
 """Dump bioRxiv data in JSONL format."""
+
 import json
 import os
 from datetime import datetime

@@ -20,22 +21,25 @@ def biorxiv(
     begin_date: Optional[str] = None,
     end_date: Optional[str] = None,
     save_path: str = save_path,
+    max_retries: int = 10,
 ):
     """Fetches papers from biorxiv based on time range, i.e., begin_date and end_date.
     If the begin_date and end_date are not provided, papers will be fetched from biorxiv
     from the launch date of biorxiv until the current date. The fetched papers will be
     stored in jsonl format in save_path.

     Args:
+        begin_date (str, optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., earliest possible.
+        end_date (str, optional): end date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., today.
         save_path (str, optional): Path where the dump is stored.
             Defaults to save_path.
-        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
-            Defaults to None.
-        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
-            Defaults to None.
+        max_retries (int, optional): Number of retries when API shows connection issues.
+            Defaults to 10.
     """
     # create API client
-    api = BioRxivApi()
+    api = BioRxivApi(max_retries=max_retries)

     # dump all papers
     with open(save_path, "w") as fp:
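
For illustration, a minimal sketch of calling the updated dumper with an explicit date range and retry budget; the argument names follow the signature above, and the output path is hypothetical:

```py
from paperscraper.get_dumps import biorxiv

# Dump one month of bioRxiv metadata, allowing up to 20 retries per request
biorxiv(
    begin_date="2023-01-01",
    end_date="2023-01-31",
    save_path="biorxiv_2023_01.jsonl",  # hypothetical output path
    max_retries=20,
)
```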

paperscraper/get_dumps/chemrxiv.py

Lines changed: 6 additions & 5 deletions
@@ -1,4 +1,5 @@
 """Dump chemRxiv data in JSONL format."""
+
 import logging
 import os
 import sys

@@ -28,11 +29,11 @@ def chemrxiv(
     stored in jsonl format in save_path.

     Args:
-        begin_date (Optional[str]): begin date expressed as YYYY-MM-DD.
-            Defaults to None.
-        end_date (Optional[str]): end date expressed as YYYY-MM-DD.
-            Defaults to None.
-        save_path (str): Path where the dump is stored.
+        begin_date (str, optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., earliest possible.
+        end_date (str, optional): end date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., today.
+        save_path (str, optional): Path where the dump is stored.
             Defaults to save_path.
     """

paperscraper/get_dumps/medrxiv.py

Lines changed: 9 additions & 5 deletions
@@ -1,4 +1,5 @@
 """Dump medrxiv data in JSONL format."""
+
 import json
 import os
 from datetime import datetime

@@ -18,22 +19,25 @@ def medrxiv(
     begin_date: Optional[str] = None,
     end_date: Optional[str] = None,
     save_path: str = save_path,
+    max_retries: int = 10,
 ):
     """Fetches papers from medrxiv based on time range, i.e., begin_date and end_date.
     If the begin_date and end_date are not provided, then papers will be fetched from
     medrxiv starting from the launch date of medrxiv until current date. The fetched
     papers will be stored in jsonl format in save_path.

     Args:
+        begin_date (str, optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., earliest possible.
+        end_date (str, optional): end date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., today.
         save_path (str, optional): Path where the dump is stored.
             Defaults to save_path.
-        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
-            Defaults to None.
-        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
-            Defaults to None.
+        max_retries (int, optional): Number of retries when API shows connection issues.
+            Defaults to 10.
     """
     # create API client
-    api = MedRxivApi()
+    api = MedRxivApi(max_retries=max_retries)
     # dump all papers
     with open(save_path, "w") as fp:
         for index, paper in enumerate(

paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py

Lines changed: 2 additions & 2 deletions
@@ -71,9 +71,9 @@ def request(self, url, method, params=None):
         """Send an API request to open Engage."""

         if method.casefold() == "get":
-            return requests.get(url, params=params)
+            return requests.get(url, params=params, timeout=10)
         elif method.casefold() == "post":
-            return requests.post(url, json=params)
+            return requests.post(url, json=params, timeout=10)
         else:
             raise ConnectionError(f"Unknown method for query: {method}")
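
The added `timeout=10` bounds how long each request may block; a stalled connection now raises instead of hanging. A minimal sketch of the failure mode this introduces (the URL is a hypothetical placeholder):

```py
import requests

try:
    response = requests.get("https://example.org/api/items", timeout=10)
except requests.exceptions.Timeout:
    # Without a timeout, a stalled connection could block indefinitely
    print("Request timed out after 10 seconds")
```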

paperscraper/pdf.py

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ def save_pdf(paper_metadata: Dict[str, Any], filepath: str) -> None:

     url = f"https://doi.org/{paper_metadata['doi']}"
     try:
-        response = requests.get(url)
+        response = requests.get(url, timeout=60)
     except Exception:
         logger.warning(f"Could not download {url}.")
         return

@@ -55,7 +55,7 @@ def save_pdf(paper_metadata: Dict[str, Any], filepath: str) -> None:
         pdf_url = metas.attrs.get("content")

     try:
-        response = requests.get(pdf_url)
+        response = requests.get(pdf_url, timeout=60)
     except Exception:
         logger.warning(f"Could not download {pdf_url}.")
         return
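
For context, a sketch of invoking `save_pdf` with the now timeout-protected downloads, assuming (per the fragment above) that a metadata dict with a `doi` key suffices; the DOI shown is illustrative:

```py
from paperscraper.pdf import save_pdf

# Each requests.get inside save_pdf now aborts after 60 seconds instead of hanging
save_pdf({"doi": "10.1101/2020.01.01.000001"}, filepath="paper.pdf")
```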

paperscraper/tests/test_dumpy.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import logging
+import threading
+
+import pytest
+
+from paperscraper.get_dumps import biorxiv, medrxiv
+
+logging.disable(logging.INFO)
+
+
+class TestDumper:
+    @pytest.fixture
+    def setup_medrxiv(self):
+        return medrxiv
+
+    @pytest.fixture
+    def setup_biorxiv(self):
+        return lambda: biorxiv(max_retries=2)
+
+    def run_function_with_timeout(self, func, timeout):
+        # Define the target function for the thread
+        def target():
+            func()
+
+        # Create a daemon thread that runs the target function
+        thread = threading.Thread(target=target)
+        thread.daemon = True  # This makes the thread exit when the main thread exits
+        thread.start()
+        thread.join(
+            timeout=timeout
+        )  # Wait for the specified time or until the function finishes
+        if thread.is_alive():
+            return True  # Function is still running, which is our success condition
+        return False  # Function has completed or failed within the timeout, which we don't expect
+
+    @pytest.mark.timeout(30)
+    def test_medrxiv(self, setup_medrxiv):
+        # Check that the function runs for at least 15 seconds
+        assert self.run_function_with_timeout(
+            setup_medrxiv, 15
+        ), "medrxiv should still be running after 15 seconds"
+
+    @pytest.mark.timeout(30)
+    def test_biorxiv(self, setup_biorxiv):
+        # Check that the function runs for at least 15 seconds
+        assert self.run_function_with_timeout(
+            setup_biorxiv, 15
+        ), "biorxiv should still be running after 15 seconds"

paperscraper/xrxiv/xrxiv_api.py

Lines changed: 63 additions & 17 deletions
@@ -1,8 +1,12 @@
 """API for bioRxiv and medRXiv."""
+
 import logging
+import time
 from datetime import datetime
+from functools import wraps
 from time import sleep
 from typing import Generator, List, Optional
+from urllib.error import HTTPError

 import requests
 from requests.exceptions import ConnectionError, Timeout

@@ -11,6 +15,33 @@
 logger = logging.getLogger(__name__)


+def retry_multi():
+    """Retry a function several times"""
+
+    def decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            num_retries = 0
+            max_retries = getattr(self, "max_retries", 10)
+            while num_retries <= max_retries:
+                try:
+                    ret = func(self, *args, **kwargs)
+                    if ret is None:
+                        time.sleep(5)
+                        continue
+                    break
+                except HTTPError:
+                    if num_retries == max_retries:
+                        raise
+                    num_retries += 1
+                    time.sleep(5)
+            return ret
+
+        return wrapper
+
+    return decorator
+
+
 class XRXivApi:
     """API class."""

@@ -19,6 +50,7 @@ def __init__(
         server: str,
         launch_date: str,
         api_base_url: str = "https://api.biorxiv.org",
+        max_retries: int = 10,
     ):
         """
         Initialize API class.

@@ -27,6 +59,8 @@ def __init__(
             server (str): name of the preprint server to access.
             launch_date (str): launch date expressed as YYYY-MM-DD.
             api_base_url (str, optional): Base url for the API. Defaults to 'api.biorxiv.org'.
+            max_retries (int, optional): Maximal number of retries for a request before an
+                error is raised. Defaults to 10.
         """
         self.server = server
         self.api_base_url = api_base_url

@@ -36,6 +70,22 @@ def __init__(
             "{}/details/{}".format(self.api_base_url, self.server)
             + "/{begin_date}/{end_date}/{cursor}"
         )
+        self.max_retries = max_retries
+
+    @retry_multi()
+    def call_api(self, begin_date, end_date, cursor):
+        try:
+            json_response = requests.get(
+                self.get_papers_url.format(
+                    begin_date=begin_date, end_date=end_date, cursor=cursor
+                ),
+                timeout=10,
+            ).json()
+        except requests.exceptions.Timeout:
+            logger.info("Timed out, will retry")
+            return None
+
+        return json_response

     def get_papers(
         self,

@@ -77,11 +127,7 @@ def get_papers(
         papers = []
         for attempt in range(max_retries):
             try:
-                json_response = requests.get(
-                    self.get_papers_url.format(
-                        begin_date=begin_date, end_date=end_date, cursor=cursor
-                    )
-                ).json()
+                json_response = self.call_api(begin_date, end_date, cursor)
                 do_loop = json_response["messages"][0]["status"] == "ok"
                 if do_loop:
                     cursor += json_response["messages"][0]["count"]

@@ -102,27 +148,27 @@ def get_papers(
                     continue
             except Exception as exc:
                 logger.exception(f"Failed getting papers: {exc}")
-                raise RuntimeError(
-                    "Failed getting papers: {} - {}".format(
-                        exc.__class__.__name__, exc
-                    )
-                )
         except Exception as exc:
             logger.exception(f"Failed getting papers: {exc}")
-            raise RuntimeError(
-                "Failed getting papers: {} - {}".format(exc.__class__.__name__, exc)
-            )


 class BioRxivApi(XRXivApi):
     """bioRxiv API."""

-    def __init__(self):
-        super().__init__(server="biorxiv", launch_date=launch_dates["biorxiv"])
+    def __init__(self, max_retries: int = 10):
+        super().__init__(
+            server="biorxiv",
+            launch_date=launch_dates["biorxiv"],
+            max_retries=max_retries,
+        )


 class MedRxivApi(XRXivApi):
     """medRxiv API."""

-    def __init__(self):
-        super().__init__(server="medrxiv", launch_date=launch_dates["medrxiv"])
+    def __init__(self, max_retries: int = 10):
+        super().__init__(
+            server="medrxiv",
+            launch_date=launch_dates["medrxiv"],
+            max_retries=max_retries,
+        )
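
Putting the pieces together, a minimal sketch of how the retrying client can be exercised directly; the import path is inferred from the file location and the dates are illustrative:

```py
from paperscraper.xrxiv.xrxiv_api import BioRxivApi

api = BioRxivApi(max_retries=20)

# call_api retries on HTTPError and re-requests after a timeout (a None return triggers a retry)
response = api.call_api(begin_date="2023-01-01", end_date="2023-01-02", cursor=0)
print(response["messages"][0]["status"])  # "ok" on success
```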
