Commit d6b893d

memray and jannisborn authored

add retry logic in XRXivApi to tackle request timed out (#43)

* add retry logic in XRXivApi to tackle request timed out
* chore: add timeout elsewhere in package
* refactor: parameterize max_retries
* chore: add request timeout
* chore: populate max_retries to top level
* chore: version bump
* ci: test tip when coming from fork
* doc: Update README
* test: add pytest for fetching logs

Co-authored-by: jannisborn <[email protected]>

1 parent 444eb13 commit d6b893d

File tree

10 files changed: +143 additions, -40 deletions

.github/workflows/test_tip.yml

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v2
         with:
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
           ref: ${{ github.event.pull_request.head.ref }}

       - name: Set up Python 3.8

README.md

Lines changed: 2 additions & 3 deletions
@@ -17,7 +17,6 @@ publication metadata as well as full PDF files from **PubMed** or from preprint
 **medRxiv**, **bioRxiv** and **chemRxiv**. It provides a streamlined interface to scrape metadata and comes
 with simple postprocessing functions and plotting routines for meta-analysis.

-Since v0.2.4 `paperscraper` also supports scraping PDF files directly! Thanks to [@daenuprobst](https://github.com/daenuprobst) for suggestions!

 ## Getting started

@@ -37,8 +36,8 @@ medrxiv() # Takes ~30min and should result in ~35 MB file
 biorxiv() # Takes ~1h and should result in ~350 MB file
 chemrxiv() # Takes ~45min and should result in ~20 MB file
 ```
-*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter
-so that the changes take effect.
+*NOTE*: Once the dumps are stored, please make sure to restart the python interpreter so that the changes take effect.
+*NOTE*: If you experience API connection issues (`ConnectionError`), since v0.2.12 there are automatic retries which you can even control and raise from the default of 10, as in `biorxiv(max_retries=20)`, thanks to [@memray](https://github.com/memray) for contributions!

 Since v0.2.5 `paperscraper` also allows to scrape {med/bio/chem}rxiv for specific dates! Thanks to [@achouhan93](https://github.com/achouhan93) for contributions!
 ```py
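
Taken together, the README changes describe the new retry control; a minimal usage sketch (imports as exposed by `paperscraper.get_dumps`, per the test added below):

```py
from paperscraper.get_dumps import biorxiv, medrxiv

# Default: up to 10 automatic retries on connection issues (since v0.2.12)
medrxiv()

# Raise the retry budget for flaky connections, as suggested in the README
biorxiv(max_retries=20)
```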

paperscraper/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """Initialize the module."""

 __name__ = "paperscraper"
-__version__ = "0.2.11"
+__version__ = "0.2.12"

 import logging
 import os

paperscraper/get_dumps/biorxiv.py

Lines changed: 9 additions & 5 deletions
@@ -1,4 +1,5 @@
 """Dump bioRxiv data in JSONL format."""
+
 import json
 import os
 from datetime import datetime

@@ -20,22 +21,25 @@ def biorxiv(
     begin_date: Optional[str] = None,
     end_date: Optional[str] = None,
     save_path: str = save_path,
+    max_retries: int = 10,
 ):
     """Fetches papers from biorxiv based on time range, i.e., begin_date and end_date.
     If the begin_date and end_date are not provided, papers will be fetched from biorxiv
     from the launch date of biorxiv until the current date. The fetched papers will be
     stored in jsonl format in save_path.

     Args:
+        begin_date (str, optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., earliest possible.
+        end_date (str, optional): end date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., today.
         save_path (str, optional): Path where the dump is stored.
             Defaults to save_path.
-        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
-            Defaults to None.
-        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
-            Defaults to None.
+        max_retries (int, optional): Number of retries when API shows connection issues.
+            Defaults to 10.
     """
     # create API client
-    api = BioRxivApi()
+    api = BioRxivApi(max_retries=max_retries)

     # dump all papers
     with open(save_path, "w") as fp:
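
For illustration, a minimal sketch of calling the updated dumper with an explicit date range and retry budget; the argument names follow the signature above, and the output path is hypothetical:

```py
from paperscraper.get_dumps import biorxiv

# Dump one month of bioRxiv metadata, allowing up to 20 retries per request
biorxiv(
    begin_date="2023-01-01",
    end_date="2023-01-31",
    save_path="biorxiv_2023_01.jsonl",  # hypothetical output path
    max_retries=20,
)
```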

paperscraper/get_dumps/chemrxiv.py

Lines changed: 6 additions & 5 deletions
@@ -1,4 +1,5 @@
 """Dump chemRxiv data in JSONL format."""
+
 import logging
 import os
 import sys

@@ -28,11 +29,11 @@ def chemrxiv(
     stored in jsonl format in save_path.

     Args:
-        begin_date (Optional[str]): begin date expressed as YYYY-MM-DD.
-            Defaults to None.
-        end_date (Optional[str]): end date expressed as YYYY-MM-DD.
-            Defaults to None.
-        save_path (str): Path where the dump is stored.
+        begin_date (str, optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., earliest possible.
+        end_date (str, optional): end date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., today.
+        save_path (str, optional): Path where the dump is stored.
             Defaults to save_path.
     """

paperscraper/get_dumps/medrxiv.py

Lines changed: 9 additions & 5 deletions
@@ -1,4 +1,5 @@
 """Dump medrxiv data in JSONL format."""
+
 import json
 import os
 from datetime import datetime

@@ -18,22 +19,25 @@ def medrxiv(
     begin_date: Optional[str] = None,
     end_date: Optional[str] = None,
     save_path: str = save_path,
+    max_retries: int = 10,
 ):
     """Fetches papers from medrxiv based on time range, i.e., begin_date and end_date.
     If the begin_date and end_date are not provided, then papers will be fetched from
     medrxiv starting from the launch date of medrxiv until current date. The fetched
     papers will be stored in jsonl format in save_path.

     Args:
+        begin_date (str, optional): begin date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., earliest possible.
+        end_date (str, optional): end date expressed as YYYY-MM-DD.
+            Defaults to None, i.e., today.
         save_path (str, optional): Path where the dump is stored.
             Defaults to save_path.
-        begin_date (Optional[str], optional): begin date expressed as YYYY-MM-DD.
-            Defaults to None.
-        end_date (Optional[str], optional): end date expressed as YYYY-MM-DD.
-            Defaults to None.
+        max_retries (int, optional): Number of retries when API shows connection issues.
+            Defaults to 10.
     """
     # create API client
-    api = MedRxivApi()
+    api = MedRxivApi(max_retries=max_retries)
     # dump all papers
     with open(save_path, "w") as fp:
         for index, paper in enumerate(

paperscraper/get_dumps/utils/chemrxiv/chemrxiv_api.py

Lines changed: 2 additions & 2 deletions
@@ -71,9 +71,9 @@ def request(self, url, method, params=None):
         """Send an API request to open Engage."""

         if method.casefold() == "get":
-            return requests.get(url, params=params)
+            return requests.get(url, params=params, timeout=10)
         elif method.casefold() == "post":
-            return requests.post(url, json=params)
+            return requests.post(url, json=params, timeout=10)
         else:
             raise ConnectionError(f"Unknown method for query: {method}")
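
The added `timeout=10` bounds how long each request may block; a stalled connection now raises instead of hanging. A minimal sketch of the failure mode this introduces (the URL is a hypothetical placeholder):

```py
import requests

try:
    response = requests.get("https://example.org/api/items", timeout=10)
except requests.exceptions.Timeout:
    # Without a timeout, a stalled connection could block indefinitely
    print("Request timed out after 10 seconds")
```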

paperscraper/pdf.py

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ def save_pdf(paper_metadata: Dict[str, Any], filepath: str) -> None:

     url = f"https://doi.org/{paper_metadata['doi']}"
     try:
-        response = requests.get(url)
+        response = requests.get(url, timeout=60)
     except Exception:
         logger.warning(f"Could not download {url}.")
         return

@@ -55,7 +55,7 @@ def save_pdf(paper_metadata: Dict[str, Any], filepath: str) -> None:
         pdf_url = metas.attrs.get("content")

     try:
-        response = requests.get(pdf_url)
+        response = requests.get(pdf_url, timeout=60)
     except Exception:
         logger.warning(f"Could not download {pdf_url}.")
         return
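
For context, a sketch of invoking `save_pdf` with the now timeout-protected downloads, assuming (per the fragment above) that a metadata dict with a `doi` key suffices; the DOI shown is illustrative:

```py
from paperscraper.pdf import save_pdf

# Each requests.get inside save_pdf now aborts after 60 seconds instead of hanging
save_pdf({"doi": "10.1101/2020.01.01.000001"}, filepath="paper.pdf")
```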

paperscraper/tests/test_dumpy.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import logging
+import threading
+
+import pytest
+
+from paperscraper.get_dumps import biorxiv, medrxiv
+
+logging.disable(logging.INFO)
+
+
+class TestDumper:
+    @pytest.fixture
+    def setup_medrxiv(self):
+        return medrxiv
+
+    @pytest.fixture
+    def setup_biorxiv(self):
+        return lambda: biorxiv(max_retries=2)
+
+    def run_function_with_timeout(self, func, timeout):
+        # Define the target function for the thread
+        def target():
+            func()
+
+        # Create a daemon thread that runs the target function
+        thread = threading.Thread(target=target)
+        thread.daemon = True  # This makes the thread exit when the main thread exits
+        thread.start()
+        thread.join(
+            timeout=timeout
+        )  # Wait for the specified time or until the function finishes
+        if thread.is_alive():
+            return True  # Function is still running, which is our success condition
+        return False  # Function has completed or failed within the timeout, which we don't expect
+
+    @pytest.mark.timeout(30)
+    def test_medrxiv(self, setup_medrxiv):
+        # Check that the function runs for at least 15 seconds
+        assert self.run_function_with_timeout(
+            setup_medrxiv, 15
+        ), "medrxiv should still be running after 15 seconds"
+
+    @pytest.mark.timeout(30)
+    def test_biorxiv(self, setup_biorxiv):
+        # Check that the function runs for at least 15 seconds
+        assert self.run_function_with_timeout(
+            setup_biorxiv, 15
+        ), "biorxiv should still be running after 15 seconds"

paperscraper/xrxiv/xrxiv_api.py

Lines changed: 63 additions & 17 deletions
@@ -1,8 +1,12 @@
 """API for bioRxiv and medRXiv."""
+
 import logging
+import time
 from datetime import datetime
+from functools import wraps
 from time import sleep
 from typing import Generator, List, Optional
+from urllib.error import HTTPError

 import requests
 from requests.exceptions import ConnectionError, Timeout

@@ -11,6 +15,33 @@
 logger = logging.getLogger(__name__)


+def retry_multi():
+    """Retry a function several times"""
+
+    def decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            num_retries = 0
+            max_retries = getattr(self, "max_retries", 10)
+            while num_retries <= max_retries:
+                try:
+                    ret = func(self, *args, **kwargs)
+                    if ret is None:
+                        time.sleep(5)
+                        continue
+                    break
+                except HTTPError:
+                    if num_retries == max_retries:
+                        raise
+                    num_retries += 1
+                    time.sleep(5)
+            return ret
+
+        return wrapper
+
+    return decorator
+
+
 class XRXivApi:
     """API class."""

@@ -19,6 +50,7 @@ def __init__(
         server: str,
         launch_date: str,
         api_base_url: str = "https://api.biorxiv.org",
+        max_retries: int = 10,
     ):
         """
         Initialize API class.

@@ -27,6 +59,8 @@ def __init__(
             server (str): name of the preprint server to access.
             launch_date (str): launch date expressed as YYYY-MM-DD.
             api_base_url (str, optional): Base url for the API. Defaults to 'api.biorxiv.org'.
+            max_retries (int, optional): Maximal number of retries for a request before an
+                error is raised. Defaults to 10.
         """
         self.server = server
         self.api_base_url = api_base_url

@@ -36,6 +70,22 @@ def __init__(
             "{}/details/{}".format(self.api_base_url, self.server)
             + "/{begin_date}/{end_date}/{cursor}"
         )
+        self.max_retries = max_retries
+
+    @retry_multi()
+    def call_api(self, begin_date, end_date, cursor):
+        try:
+            json_response = requests.get(
+                self.get_papers_url.format(
+                    begin_date=begin_date, end_date=end_date, cursor=cursor
+                ),
+                timeout=10,
+            ).json()
+        except requests.exceptions.Timeout:
+            logger.info("Timed out, will retry")
+            return None
+
+        return json_response

     def get_papers(
         self,

@@ -77,11 +127,7 @@ def get_papers(
         papers = []
         for attempt in range(max_retries):
             try:
-                json_response = requests.get(
-                    self.get_papers_url.format(
-                        begin_date=begin_date, end_date=end_date, cursor=cursor
-                    )
-                ).json()
+                json_response = self.call_api(begin_date, end_date, cursor)
                 do_loop = json_response["messages"][0]["status"] == "ok"
                 if do_loop:
                     cursor += json_response["messages"][0]["count"]

@@ -102,27 +148,27 @@ def get_papers(
                     continue
             except Exception as exc:
                 logger.exception(f"Failed getting papers: {exc}")
-                raise RuntimeError(
-                    "Failed getting papers: {} - {}".format(
-                        exc.__class__.__name__, exc
-                    )
-                )
         except Exception as exc:
             logger.exception(f"Failed getting papers: {exc}")
-            raise RuntimeError(
-                "Failed getting papers: {} - {}".format(exc.__class__.__name__, exc)
-            )


 class BioRxivApi(XRXivApi):
     """bioRxiv API."""

-    def __init__(self):
-        super().__init__(server="biorxiv", launch_date=launch_dates["biorxiv"])
+    def __init__(self, max_retries: int = 10):
+        super().__init__(
+            server="biorxiv",
+            launch_date=launch_dates["biorxiv"],
+            max_retries=max_retries,
+        )


 class MedRxivApi(XRXivApi):
     """medRxiv API."""

-    def __init__(self):
-        super().__init__(server="medrxiv", launch_date=launch_dates["medrxiv"])
+    def __init__(self, max_retries: int = 10):
+        super().__init__(
+            server="medrxiv",
+            launch_date=launch_dates["medrxiv"],
+            max_retries=max_retries,
+        )
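
Putting the pieces together, a minimal sketch of how the retrying client can be exercised directly; the import path is inferred from the file location and the dates are illustrative:

```py
from paperscraper.xrxiv.xrxiv_api import BioRxivApi

api = BioRxivApi(max_retries=20)

# call_api retries on HTTPError and re-requests after a timeout (a None return triggers a retry)
response = api.call_api(begin_date="2023-01-01", end_date="2023-01-02", cursor=0)
print(response["messages"][0]["status"])  # "ok" on success
```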
